From patchwork Mon Jun 6 22:37:19 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1639650 Received: from noah-tgl.. 
([2600:1010:b04a:6ef:921e:3108:9361:2ef8]) by smtp.gmail.com with ESMTPSA id e22-20020a17090a4a1600b001e345c579d5sm10417532pjh.26.2022.06.06.15.37.31 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 06 Jun 2022 15:37:31 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v4 1/8] x86: Create header for VEC classes in x86 strings library Date: Mon, 6 Jun 2022 15:37:19 -0700 Message-Id: <20220606223726.2082226-1-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220603044229.2180216-2-goldstein.w.n@gmail.com> References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE, URIBL_BLACK autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org Sender: "Libc-alpha" This patch does not touch any existing code and is only meant to be a tool for future patches so that simple source files can more easily be maintained to target multiple VEC classes. There is no difference in the objdump of libc.so before and after this patch. --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 34 ++++++++ sysdeps/x86_64/multiarch/avx-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/evex-vecs-common.h | 39 +++++++++ sysdeps/x86_64/multiarch/evex256-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/evex512-vecs.h | 35 ++++++++ sysdeps/x86_64/multiarch/sse2-vecs.h | 47 +++++++++++ sysdeps/x86_64/multiarch/vec-macros.h | 90 +++++++++++++++++++++ 7 files changed, 327 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex-vecs-common.h create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h new file mode 100644 index 0000000000..3f531dd47f --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -0,0 +1,34 @@ +/* Common config for AVX-RTM VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#ifndef _AVX_RTM_VECS_H +#define _AVX_RTM_VECS_H 1 + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define USE_WITH_RTM 1 +#include "avx-vecs.h" + +#undef SECTION +#define SECTION(p) p##.avx.rtm + +#endif diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h new file mode 100644 index 0000000000..89680f5db8 --- /dev/null +++ b/sysdeps/x86_64/multiarch/avx-vecs.h @@ -0,0 +1,47 @@ +/* Common config for AVX VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _AVX_VECS_H +#define _AVX_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "vec-macros.h" + +#define USE_WITH_AVX 1 +#define SECTION(p) p##.avx + +/* 4-byte mov instructions with AVX2. */ +#define MOV_SIZE 4 +/* 1 (ret) + 3 (vzeroupper). */ +#define RET_SIZE 4 +#define VZEROUPPER vzeroupper + +#define VMOVU vmovdqu +#define VMOVA vmovdqa +#define VMOVNT vmovntdq + +/* Often need to access xmm portion. */ +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h new file mode 100644 index 0000000000..99806ebcd7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h @@ -0,0 +1,39 @@ +/* Common config for EVEX256 and EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _EVEX_VECS_COMMON_H +#define _EVEX_VECS_COMMON_H 1 + +#include "vec-macros.h" + +/* 6-byte mov instructions with EVEX. */ +#define MOV_SIZE 6 +/* No vzeroupper needed. 
*/ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU vmovdqu64 +#define VMOVA vmovdqa64 +#define VMOVNT vmovntdq + +#define VEC_xmm VEC_hi_xmm +#define VEC_ymm VEC_hi_ymm +#define VEC_zmm VEC_hi_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h new file mode 100644 index 0000000000..222ba46dc7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex256-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX256 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _EVEX256_VECS_H +#define _EVEX256_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 32 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX256 1 +#define SECTION(p) p##.evex + +#define VEC VEC_ymm + +#endif diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h new file mode 100644 index 0000000000..d1784d5368 --- /dev/null +++ b/sysdeps/x86_64/multiarch/evex512-vecs.h @@ -0,0 +1,35 @@ +/* Common config for EVEX512 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _EVEX512_VECS_H +#define _EVEX512_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 64 +#include "evex-vecs-common.h" + +#define USE_WITH_EVEX512 1 +#define SECTION(p) p##.evex512 + +#define VEC VEC_zmm + +#endif diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h new file mode 100644 index 0000000000..2b77a59d56 --- /dev/null +++ b/sysdeps/x86_64/multiarch/sse2-vecs.h @@ -0,0 +1,47 @@ +/* Common config for SSE2 VECs + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _SSE2_VECS_H +#define _SSE2_VECS_H 1 + +#ifdef VEC_SIZE +# error "Multiple VEC configs included!" +#endif + +#define VEC_SIZE 16 +#include "vec-macros.h" + +#define USE_WITH_SSE2 1 +#define SECTION(p) p + +/* 3-byte mov instructions with SSE2. */ +#define MOV_SIZE 3 +/* No vzeroupper needed. */ +#define RET_SIZE 1 +#define VZEROUPPER + +#define VMOVU movups +#define VMOVA movaps +#define VMOVNT movntdq + +#define VEC_xmm VEC_any_xmm +#define VEC VEC_any_xmm + + +#endif diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h new file mode 100644 index 0000000000..9f3ffecede --- /dev/null +++ b/sysdeps/x86_64/multiarch/vec-macros.h @@ -0,0 +1,90 @@ +/* Macro helpers for VEC_{type}({vec_num}) + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _VEC_MACROS_H +#define _VEC_MACROS_H 1 + +#ifndef VEC_SIZE +# error "Never include this file directly. Always include a vector config." +#endif + +/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same + VEC(N) values. 
*/ +#define VEC_hi_xmm0 xmm16 +#define VEC_hi_xmm1 xmm17 +#define VEC_hi_xmm2 xmm18 +#define VEC_hi_xmm3 xmm19 +#define VEC_hi_xmm4 xmm20 +#define VEC_hi_xmm5 xmm21 +#define VEC_hi_xmm6 xmm22 +#define VEC_hi_xmm7 xmm23 +#define VEC_hi_xmm8 xmm24 +#define VEC_hi_xmm9 xmm25 +#define VEC_hi_xmm10 xmm26 +#define VEC_hi_xmm11 xmm27 +#define VEC_hi_xmm12 xmm28 +#define VEC_hi_xmm13 xmm29 +#define VEC_hi_xmm14 xmm30 +#define VEC_hi_xmm15 xmm31 + +#define VEC_hi_ymm0 ymm16 +#define VEC_hi_ymm1 ymm17 +#define VEC_hi_ymm2 ymm18 +#define VEC_hi_ymm3 ymm19 +#define VEC_hi_ymm4 ymm20 +#define VEC_hi_ymm5 ymm21 +#define VEC_hi_ymm6 ymm22 +#define VEC_hi_ymm7 ymm23 +#define VEC_hi_ymm8 ymm24 +#define VEC_hi_ymm9 ymm25 +#define VEC_hi_ymm10 ymm26 +#define VEC_hi_ymm11 ymm27 +#define VEC_hi_ymm12 ymm28 +#define VEC_hi_ymm13 ymm29 +#define VEC_hi_ymm14 ymm30 +#define VEC_hi_ymm15 ymm31 + +#define VEC_hi_zmm0 zmm16 +#define VEC_hi_zmm1 zmm17 +#define VEC_hi_zmm2 zmm18 +#define VEC_hi_zmm3 zmm19 +#define VEC_hi_zmm4 zmm20 +#define VEC_hi_zmm5 zmm21 +#define VEC_hi_zmm6 zmm22 +#define VEC_hi_zmm7 zmm23 +#define VEC_hi_zmm8 zmm24 +#define VEC_hi_zmm9 zmm25 +#define VEC_hi_zmm10 zmm26 +#define VEC_hi_zmm11 zmm27 +#define VEC_hi_zmm12 zmm28 +#define VEC_hi_zmm13 zmm29 +#define VEC_hi_zmm14 zmm30 +#define VEC_hi_zmm15 zmm31 + +#define PRIMITIVE_VEC(vec, num) vec##num + +#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) +#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) +#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) + +#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) +#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) +#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) + +#endif From patchwork Mon Jun 6 22:37:20 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1639651 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: bilbo.ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.a=rsa-sha256 header.s=default header.b=HSx2jh6t; dkim-atps=neutral Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=8.43.85.97; helo=sourceware.org; envelope-from=libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Received: from sourceware.org (server2.sourceware.org [8.43.85.97]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by bilbo.ozlabs.org (Postfix) with ESMTPS id 4LH7gZ63snz9sFs for ; Tue, 7 Jun 2022 08:38:34 +1000 (AEST) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id D6A4938303E2 for ; Mon, 6 Jun 2022 22:38:31 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org D6A4938303E2 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1654555111; bh=OihTx7wocCb58SxTpZnoXms3ekfNBHSaIy//8tjU1Sc=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=HSx2jh6t8eR/Ge+7aFaxnllLW9lMJP26DnJZquvkwMpWGIp2qxc8N43fSHDZVNrJT vGDTvqRpX9Fq+CIp4oKYxXfwai7p218lQwwiaESOud6+nCls/R4NQ7udSdj9gSyMA+ 43J26xjI2sczWlkyWYNBwMA2C0iA360xtXSayViY= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from 
mail-pl1-x629.google.com (mail-pl1-x629.google.com [IPv6:2607:f8b0:4864:20::629]) by sourceware.org (Postfix) with ESMTPS id 27E6E3839186 for ; Mon, 6 Jun 2022 22:37:35 +0000 (GMT) To: libc-alpha@sourceware.org Subject: [PATCH v4 2/8] x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` Date: Mon, 6 Jun 2022 15:37:20 -0700 Message-Id: <20220606223726.2082226-2-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220606223726.2082226-1-goldstein.w.n@gmail.com> References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> <20220606223726.2082226-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 From: Noah Goldstein Reply-To: Noah Goldstein Sender: "Libc-alpha"

The RTM vzeroupper mitigation has no way of replacing an inline vzeroupper that is not immediately before a return. COND_VZEROUPPER fills that gap, which is useful when hoisting a vzeroupper out of multiple return paths to save code size, for example:

```
L(foo):
	cmpl %eax, %edx
	jz L(bar)
	tzcntl %eax, %eax
	addq %rdi, %rax
	VZEROUPPER_RETURN
L(bar):
	xorl %eax, %eax
	VZEROUPPER_RETURN
```

Can become:

```
L(foo):
	COND_VZEROUPPER
	cmpl %eax, %edx
	jz L(bar)
	tzcntl %eax, %eax
	addq %rdi, %rax
	ret
L(bar):
	xorl %eax, %eax
	ret
```

This code does not change any existing functionality. There is no difference in the objdump of libc.so before and after this patch. Reviewed-by: H.J. 
Lu --- sysdeps/x86_64/multiarch/avx-rtm-vecs.h | 1 + sysdeps/x86_64/sysdep.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h index 3f531dd47f..6ca9f5e6ba 100644 --- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h @@ -20,6 +20,7 @@ #ifndef _AVX_RTM_VECS_H #define _AVX_RTM_VECS_H 1 +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index f14d50786d..4f512d5566 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -106,6 +106,24 @@ lose: \ vzeroupper; \ ret +/* Can be used to replace vzeroupper that is not directly before a + return. This is useful when hoisting a vzeroupper from multiple + return paths to decrease the total number of vzerouppers and code + size. */ +#define COND_VZEROUPPER_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + jmp 2f; \ +1: \ + vzeroupper; \ +2: + +/* In RTM define this as COND_VZEROUPPER_XTEST. */ +#ifndef COND_VZEROUPPER +# define COND_VZEROUPPER vzeroupper +#endif + /* Zero upper vector registers and return. */ #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN # define ZERO_UPPER_VEC_REGISTERS_RETURN \ From patchwork Mon Jun 6 22:37:21 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1639652 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: bilbo.ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.a=rsa-sha256 header.s=default header.b=lni9V82k; dkim-atps=neutral Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=8.43.85.97; helo=sourceware.org; envelope-from=libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Received: from sourceware.org (server2.sourceware.org [8.43.85.97]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by bilbo.ozlabs.org (Postfix) with ESMTPS id 4LH7hM6HZ7z9sFs for ; Tue, 7 Jun 2022 08:39:15 +1000 (AEST) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id E3CF738303DF for ; Mon, 6 Jun 2022 22:39:13 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org E3CF738303DF DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1654555153; bh=WKH/FOlfnLnqzIvRcb98vzZ0YWaLb7ineNsj1p0Nkug=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=lni9V82k7EXxTwqUnM8jWo+EGmJAoFeF6EEe1wdQIxsUXTg8fMtUDUBfHGWgo0FGA 5tu79DBR3e8Z1TE1PBGziEqC0i7H9m7nVwjmrQY8/OWII/FTkTfP/KkIh+cEqeMhMz PdSg/BmlnN8rM03lsmARmMvn49F6o0SUlVy1o4z8= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pf1-x42e.google.com (mail-pf1-x42e.google.com [IPv6:2607:f8b0:4864:20::42e]) by sourceware.org (Postfix) with ESMTPS id 15CCB38387D5 for ; Mon, 6 Jun 2022 22:37:37 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 15CCB38387D5 Received: by mail-pf1-x42e.google.com with SMTP id j6so13836127pfe.13 for ; Mon, 06 Jun 2022 15:37:37 -0700 
(PDT) To: libc-alpha@sourceware.org Subject: [PATCH v4 3/8] Benchtests: Improve memrchr benchmarks Date: Mon, 6 Jun 2022 15:37:21 -0700 Message-Id: <20220606223726.2082226-3-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220606223726.2082226-1-goldstein.w.n@gmail.com> References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> <20220606223726.2082226-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 From: Noah Goldstein Reply-To: Noah Goldstein Sender: "Libc-alpha"

Add a second iteration for memrchr that sets `pos` starting from the end of the buffer. Previously `pos` was only set relative to the beginning of the buffer, which isn't very useful for memrchr because its search space starts at (buf + len). 
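For reference, here is a minimal C sketch (not part of the patch) of the placement logic that the new `invert_pos` flag adds to the benchmark. The indexing mirrors the hunk below; the helper name and the small driver are invented for illustration only:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

/* Place the seek character either `pos` bytes from the start of the region
   (invert_pos == 0) or `pos` bytes from its end (invert_pos == 1).  The
   latter is the interesting case for memrchr, whose search starts at
   buf + len.  */
static void
place_seek_char (unsigned char *buf, size_t align, size_t pos, size_t len,
		 int seek_char, int invert_pos)
{
  if (pos < len)
    {
      if (invert_pos)
	buf[align + len - pos] = seek_char;
      else
	buf[align + pos] = seek_char;
      /* Sentinel just past the searched region, as in the benchmark.  */
      buf[align + len] = -seek_char;
    }
}

int
main (void)
{
  unsigned char buf[64];
  memset (buf, 0, sizeof buf);
  /* invert_pos == 1: the match lands 3 bytes before the end of the 32-byte
     region, i.e. at offset 29.  */
  place_seek_char (buf, 0, 3, 32, 23, 1);
  printf ("match at offset %td\n",
	  (unsigned char *) memrchr (buf, 23, 32) - buf);
  return 0;
}
```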
--- benchtests/bench-memchr.c | 110 ++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 45 deletions(-) diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c index 4d7212332f..0facda2fa0 100644 --- a/benchtests/bench-memchr.c +++ b/benchtests/bench-memchr.c @@ -76,7 +76,7 @@ do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s, int c, static void do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, - int seek_char) + int seek_char, int invert_pos) { size_t i; @@ -96,7 +96,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, if (pos < len) { - buf[align + pos] = seek_char; + if (invert_pos) + buf[align + len - pos] = seek_char; + else + buf[align + pos] = seek_char; buf[align + len] = -seek_char; } else @@ -109,6 +112,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, json_attr_uint (json_ctx, "pos", pos); json_attr_uint (json_ctx, "len", len); json_attr_uint (json_ctx, "seek_char", seek_char); + json_attr_uint (json_ctx, "invert_pos", invert_pos); json_array_begin (json_ctx, "timings"); @@ -123,6 +127,7 @@ int test_main (void) { size_t i; + int repeats; json_ctx_t json_ctx; test_init (); @@ -142,53 +147,68 @@ test_main (void) json_array_begin (&json_ctx, "results"); - for (i = 1; i < 8; ++i) + for (repeats = 0; repeats < 2; ++repeats) { - do_test (&json_ctx, 0, 16 << i, 2048, 23); - do_test (&json_ctx, i, 64, 256, 23); - do_test (&json_ctx, 0, 16 << i, 2048, 0); - do_test (&json_ctx, i, 64, 256, 0); - - do_test (&json_ctx, getpagesize () - 15, 64, 256, 0); + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats); + do_test (&json_ctx, i, 64, 256, 23, repeats); + do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats); + do_test (&json_ctx, i, 64, 256, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats); #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. */ - do_test (&json_ctx, 0, i, 256, 23); - do_test (&json_ctx, 0, i, 256, 0); - do_test (&json_ctx, i, i, 256, 23); - do_test (&json_ctx, i, i, 256, 0); + /* Also test the position close to the beginning for memrchr. 
*/ + do_test (&json_ctx, 0, i, 256, 23, repeats); + do_test (&json_ctx, 0, i, 256, 0, repeats); + do_test (&json_ctx, i, i, 256, 23, repeats); + do_test (&json_ctx, i, i, 256, 0, repeats); #endif - } - for (i = 1; i < 8; ++i) - { - do_test (&json_ctx, i, i << 5, 192, 23); - do_test (&json_ctx, i, i << 5, 192, 0); - do_test (&json_ctx, i, i << 5, 256, 23); - do_test (&json_ctx, i, i << 5, 256, 0); - do_test (&json_ctx, i, i << 5, 512, 23); - do_test (&json_ctx, i, i << 5, 512, 0); - - do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23); - } - for (i = 1; i < 32; ++i) - { - do_test (&json_ctx, 0, i, i + 1, 23); - do_test (&json_ctx, 0, i, i + 1, 0); - do_test (&json_ctx, i, i, i + 1, 23); - do_test (&json_ctx, i, i, i + 1, 0); - do_test (&json_ctx, 0, i, i - 1, 23); - do_test (&json_ctx, 0, i, i - 1, 0); - do_test (&json_ctx, i, i, i - 1, 23); - do_test (&json_ctx, i, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0); - - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23); - do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0); + } + for (i = 1; i < 8; ++i) + { + do_test (&json_ctx, i, i << 5, 192, 23, repeats); + do_test (&json_ctx, i, i << 5, 192, 0, repeats); + do_test (&json_ctx, i, i << 5, 256, 23, repeats); + do_test (&json_ctx, i, i << 5, 256, 0, repeats); + do_test (&json_ctx, i, i << 5, 512, 23, repeats); + do_test (&json_ctx, i, i << 5, 512, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats); + } + for (i = 1; i < 32; ++i) + { + do_test (&json_ctx, 0, i, i + 1, 23, repeats); + do_test (&json_ctx, 0, i, i + 1, 0, repeats); + do_test (&json_ctx, i, i, i + 1, 23, repeats); + do_test (&json_ctx, i, i, i + 1, 0, repeats); + do_test (&json_ctx, 0, i, i - 1, 23, repeats); + do_test (&json_ctx, 0, i, i - 1, 0, repeats); + do_test (&json_ctx, i, i, i - 1, 23, repeats); + do_test (&json_ctx, i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i + 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2, i, i - 1, 0, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () / 2 + i, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i - 1, 0, repeats); + + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 23, repeats); + do_test (&json_ctx, getpagesize () - 15, i, i + 1, 0, repeats); + #ifdef USE_AS_MEMRCHR - /* Also test the position close to the beginning for memrchr. 
*/ - do_test (&json_ctx, 0, 1, i + 1, 23); - do_test (&json_ctx, 0, 2, i + 1, 0); + do_test (&json_ctx, 0, 1, i + 1, 23, repeats); + do_test (&json_ctx, 0, 2, i + 1, 0, repeats); +#endif + } +#ifndef USE_AS_MEMRCHR + break; #endif } From patchwork Mon Jun 6 22:37:22 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1639653 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: bilbo.ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.a=rsa-sha256 header.s=default header.b=fVZMnB7p; dkim-atps=neutral Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=2620:52:3:1:0:246e:9693:128c; helo=sourceware.org; envelope-from=libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Received: from sourceware.org (server2.sourceware.org [IPv6:2620:52:3:1:0:246e:9693:128c]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by bilbo.ozlabs.org (Postfix) with ESMTPS id 4LH7jH4j3yz9sFs for ; Tue, 7 Jun 2022 08:40:03 +1000 (AEST) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id A24FF3830674 for ; Mon, 6 Jun 2022 22:40:00 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A24FF3830674 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1654555200; bh=L6oU96R7f0Tgg8soPgebUul5bGICwuQOzInTyYShe14=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=fVZMnB7pkFTB6aEiXl+D1j7ZWLaT4MIiDtEXOhBBkG8dW3c+tV1WKurrgwP59zli6 +kNmsRSU1VJ9nPapDvOo/2YdTnLiK+QQ4vGP5Espu4NA2VRZWYpIdh0CBKXHd0VwCR apQT1blEK915hLXkX6QS8JHfYZamqK43Dtk2tu5E= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pf1-x431.google.com (mail-pf1-x431.google.com [IPv6:2607:f8b0:4864:20::431]) by sourceware.org (Postfix) with ESMTPS id A8A853839C43 for ; Mon, 6 Jun 2022 22:37:38 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org A8A853839C43 Received: by mail-pf1-x431.google.com with SMTP id j6so13836198pfe.13 for ; Mon, 06 Jun 2022 15:37:38 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=L6oU96R7f0Tgg8soPgebUul5bGICwuQOzInTyYShe14=; b=0wqZxQ+Hecz+z7oKXI8oI3iSdhFt3mvuhdCXRhenToiCbyfg8gGs+AzzI5BNbpyPoe fVnEyqDzv49k3j2lpFtCkE+q4zgEytIhNC8Oqdcsr13Vsr3NRBpxdYo9GfhGAd9mc35J LV6SThkl+58SygZY+Eg1Ax70w2BBPJ/L42S0JLCjZ02VqFnyrRK9HrQLbnKI2EAIgv/v owIzxOl7W/giHgrNnb5JruqkHRbWmcODY6/3qZ0MIVdBbp3bnGPWoglWWm+D+F7u5alH VALfurF8AOtMcPalWQ/PCoP5Wzz6C1lCUxR8OuYGZ/zpsii47xAFlC3lJNddhAou9YPA Gkqg== X-Gm-Message-State: AOAM530PkC+Id0P9/Dmy+QTWGVN6c/YLw7A+F/GlCfLytPT0ejtro86b GC2xYhbtWXeJrp2+bpyICElWuDTFuS4= X-Google-Smtp-Source: ABdhPJwPqaiPFIcvin9l/XVaBITYoydDII3VD97S2b4FLqjajxOlDtVodRT3Qg11eXVazX8pNPxo1g== X-Received: by 2002:a63:1666:0:b0:3fd:a62f:94a6 with SMTP id 38-20020a631666000000b003fda62f94a6mr7963263pgw.360.1654555057173; Mon, 06 Jun 2022 15:37:37 -0700 (PDT) Received: from noah-tgl.. 
([2600:1010:b04a:6ef:921e:3108:9361:2ef8]) by smtp.gmail.com with ESMTPSA id e22-20020a17090a4a1600b001e345c579d5sm10417532pjh.26.2022.06.06.15.37.36 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 06 Jun 2022 15:37:36 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v4 4/8] x86: Optimize memrchr-sse2.S Date: Mon, 6 Jun 2022 15:37:22 -0700 Message-Id: <20220606223726.2082226-4-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220606223726.2082226-1-goldstein.w.n@gmail.com> References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> <20220606223726.2082226-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.4 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org Sender: "Libc-alpha" The new code: 1. prioritizes smaller lengths more. 2. optimizes target placement more carefully. 3. reuses logic more. 4. fixes up various inefficiencies in the logic. The total code size saving is: 394 bytes Geometric Mean of all benchmarks New / Old: 0.874 Regressions: 1. The page cross case is now colder, especially re-entry from the page cross case if a match is not found in the first VEC (roughly 50%). My general opinion with this patch is this is acceptable given the "coldness" of this case (less than 4%) and generally performance improvement in the other far more common cases. 2. There are some regressions 5-15% for medium/large user-arg lengths that have a match in the first VEC. This is because the logic was rewritten to optimize finds in the first VEC if the user-arg length is shorter (where we see roughly 20-50% performance improvements). It is not always the case this is a regression. My intuition is some frontend quirk is partially explaining the data although I haven't been able to find the root cause. Full xcheck passes on x86_64. --- sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++-------------------- 1 file changed, 292 insertions(+), 321 deletions(-) diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index d1a9f47911..b0dffd2ae2 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -18,362 +18,333 @@ . */ #include +#define VEC_SIZE 16 +#define PAGE_SIZE 4096 .text -ENTRY (__memrchr) - movd %esi, %xmm1 - - sub $16, %RDX_LP - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add %RDX_LP, %RDI_LP - pshufd $0, %xmm1, %xmm1 - - movdqu (%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. 
*/ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - mov %edi, %ecx - and $15, %ecx - jz L(loop_prolog) - - add $16, %rdi - add $16, %rdx - and $-16, %rdi - sub %rcx, %rdx - - .p2align 4 -L(loop_prolog): - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %edi, %ecx - and $63, %ecx - jz L(align64_loop) - - add $64, %rdi - add $64, %rdx - and $-64, %rdi - sub %rcx, %rdx - - .p2align 4 -L(align64_loop): - sub $64, %rdi - sub $64, %rdx - jbe L(exit_loop) - - movdqa (%rdi), %xmm0 - movdqa 16(%rdi), %xmm2 - movdqa 32(%rdi), %xmm3 - movdqa 48(%rdi), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%rdi), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %rdi, %rax +ENTRY_P2ALIGN(__memrchr, 6) +#ifdef __ILP32__ + /* Clear upper bits. */ + mov %RDX_LP, %RDX_LP +#endif + movd %esi, %xmm0 + + /* Get end pointer. */ + leaq (%rdx, %rdi), %rcx + + punpcklbw %xmm0, %xmm0 + punpcklwd %xmm0, %xmm0 + pshufd $0, %xmm0, %xmm0 + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %ecx + jz L(page_cross) + + /* NB: This load happens regardless of whether rdx (len) is zero. Since + it doesn't cross a page and the standard gurantees any pointer have + at least one-valid byte this load must be safe. For the entire + history of the x86 memrchr implementation this has been possible so + no code "should" be relying on a zero-length check before this load. + The zero-length check is moved to the page cross case because it is + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE + into 2-cache lines. */ + movups -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is + zero. */ + bsrl %eax, %eax + jz L(ret_0) + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here + if out of bounds. */ + addl %edx, %eax + jl L(zero_0) + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base + ptr. 
*/ + addq %rdi, %rax +L(ret_0): ret - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%rdi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax + .p2align 4,, 5 +L(ret_vec_x0): + bsrl %eax, %eax + leaq -(VEC_SIZE)(%rcx, %rax), %rax ret - .p2align 4 -L(exit_loop_32): - movdqa 48(%rdi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%rdi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax + .p2align 4,, 2 +L(zero_0): + xorl %eax, %eax ret - .p2align 4 -L(matches0): - bsr %eax, %eax - add %rdi, %rax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax - ret - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%rax, %rdi), %rax + .p2align 4,, 8 +L(more_1x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) + + /* Align rcx (pointer to string). */ + decq %rcx + andq $-VEC_SIZE, %rcx + + movq %rcx, %rdx + /* NB: We could consistenyl save 1-byte in this pattern with `movaps + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is + it adds more frontend uops (even if the moves can be eliminated) and + some percentage of the time actual backend uops. */ + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + subq %rdi, %rdx + pmovmskb %xmm1, %eax + + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + subl $VEC_SIZE, %edx + jbe L(ret_vec_x0_test) + + testl %eax, %eax + jnz L(ret_vec_x0) + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_1) + addl %edx, %eax + jl L(zero_0) + addq %rdi, %rax +L(ret_1): ret - .p2align 4 -L(matches48): - bsr %eax, %eax - lea 48(%rax, %rdi), %rax + /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) + causes the hot pause (length <= VEC_SIZE) to span multiple cache + lines. Naturally aligned % 16 to 8-bytes. */ +L(page_cross): + /* Zero length check. */ + testq %rdx, %rdx + jz L(zero_0) + + leaq -1(%rcx), %r8 + andq $-(VEC_SIZE), %r8 + + movaps (%r8), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + negl %ecx + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count + explicitly. */ + andl $(VEC_SIZE - 1), %ecx + shl %cl, %esi + movzwl %si, %eax + leaq (%rdi, %rdx), %rcx + cmpq %rdi, %r8 + ja L(more_1x_vec) + subl $VEC_SIZE, %edx + bsrl %eax, %eax + jz L(ret_2) + addl %edx, %eax + jl L(zero_1) + addq %rdi, %rax +L(ret_2): ret - .p2align 4 -L(matches0_1): - bsr %eax, %eax - sub $64, %rdx - add %rax, %rdx - jl L(return_null) - add %rdi, %rax + /* Fits in aliging bytes. 
*/ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %rdx - add %rax, %rdx - jl L(return_null) - lea 16(%rdi, %rax), %rax + .p2align 4,, 5 +L(ret_vec_x1): + bsrl %eax, %eax + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %rdx - add %rax, %rdx - jl L(return_null) - lea 32(%rdi, %rax), %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %eax, %eax + jnz L(ret_vec_x0) - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %rdx - add %rax, %rdx - jl L(return_null) - lea 48(%rdi, %rax), %rax - ret + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + testl %eax, %eax + jnz L(ret_vec_x1) - .p2align 4 -L(return_null): - xor %eax, %eax - ret - .p2align 4 -L(length_less16_offset0): - test %edx, %edx - jz L(return_null) + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - mov %dl, %cl - pcmpeqb (%rdi), %xmm1 + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) - mov $1, %edx - sal %cl, %edx - sub $1, %edx + addl $(VEC_SIZE), %edx + jle L(ret_vec_x2_test) - pmovmskb %xmm1, %eax +L(last_vec): + testl %eax, %eax + jnz L(ret_vec_x2) - and %edx, %eax - test %eax, %eax - jz L(return_null) + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax - bsr %eax, %eax - add %rdi, %rax + subl $(VEC_SIZE), %edx + bsrl %eax, %eax + jz L(ret_3) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax +L(ret_3): ret - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - add $16, %edx - - pshufd $0, %xmm1, %xmm1 - - mov %edi, %ecx - and $15, %ecx - jz L(length_less16_offset0) - - mov %cl, %dh - mov %ecx, %esi - add %dl, %dh - and $-16, %rdi - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax - - sar %cl, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax - test %eax, %eax - jz L(return_null) - - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 6 +L(ret_vec_x2_test): + bsrl %eax, %eax + jz L(zero_2) + addl %edx, %eax + jl L(zero_2) + addq %rdi, %rax ret - .p2align 4 -L(length_less16_part2): - movdqa 16(%rdi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %eax +L(zero_2): + xorl %eax, %eax + ret - test %eax, %eax - jnz L(length_less16_part2_return) - pcmpeqb (%rdi), %xmm1 - pmovmskb %xmm1, %eax + .p2align 4,, 5 +L(ret_vec_x2): + bsrl %eax, %eax + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - mov %esi, %ecx - sar %cl, %eax - test %eax, %eax - jz L(return_null) + .p2align 4,, 5 +L(ret_vec_x3): + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - bsr %eax, %eax - add %rdi, %rax - add %rsi, %rax + .p2align 4,, 8 +L(more_4x_vec): + testl %eax, %eax + jnz L(ret_vec_x2) + + movaps -(VEC_SIZE * 4)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_x3) + + addq $-(VEC_SIZE * 4), %rcx + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) + + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end + keeping the code from spilling to the next cache line. 
*/ + addq $(VEC_SIZE * 4 - 1), %rcx + andq $-(VEC_SIZE * 4), %rcx + leaq (VEC_SIZE * 4)(%rdi), %rdx + andq $-(VEC_SIZE * 4), %rdx + + .p2align 4,, 11 +L(loop_4x_vec): + movaps (VEC_SIZE * -1)(%rcx), %xmm1 + movaps (VEC_SIZE * -2)(%rcx), %xmm2 + movaps (VEC_SIZE * -3)(%rcx), %xmm3 + movaps (VEC_SIZE * -4)(%rcx), %xmm4 + pcmpeqb %xmm0, %xmm1 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm0, %xmm4 + + por %xmm1, %xmm2 + por %xmm3, %xmm4 + por %xmm2, %xmm4 + + pmovmskb %xmm4, %esi + testl %esi, %esi + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rcx + cmpq %rdx, %rcx + jne L(loop_4x_vec) + + subl %edi, %edx + + /* Ends up being 1-byte nop. */ + .p2align 4,, 2 +L(last_4x_vec): + movaps -(VEC_SIZE)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + testl %eax, %eax + jnz L(ret_vec_x0) + + + movaps -(VEC_SIZE * 2)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + testl %eax, %eax + jnz L(ret_vec_end) + + movaps -(VEC_SIZE * 3)(%rcx), %xmm1 + pcmpeqb %xmm0, %xmm1 + pmovmskb %xmm1, %eax + + subl $(VEC_SIZE * 3), %edx + ja L(last_vec) + bsrl %eax, %eax + jz L(ret_4) + addl %edx, %eax + jl L(zero_3) + addq %rdi, %rax +L(ret_4): ret - .p2align 4 -L(length_less16_part2_return): - bsr %eax, %eax - lea 16(%rax, %rdi), %rax + /* Ends up being 1-byte nop. */ + .p2align 4,, 3 +L(loop_end): + pmovmskb %xmm1, %eax + sall $16, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm2, %eax + testl %eax, %eax + jnz L(ret_vec_end) + + pmovmskb %xmm3, %eax + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + sall $16, %eax + orl %esi, %eax + bsrl %eax, %eax + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ret -END (__memrchr) +L(ret_vec_end): + bsrl %eax, %eax + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax + ret + /* Use in L(last_4x_vec). In the same cache line. This is just a spare + aligning bytes. */ +L(zero_3): + xorl %eax, %eax + ret + /* 2-bytes from next cache line. 
*/ +END(__memrchr) weak_alias (__memrchr, memrchr) From patchwork Mon Jun 6 22:37:23 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1639654 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: bilbo.ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.a=rsa-sha256 header.s=default header.b=FOXaqNAf; dkim-atps=neutral Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=8.43.85.97; helo=sourceware.org; envelope-from=libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Received: from sourceware.org (server2.sourceware.org [8.43.85.97]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by bilbo.ozlabs.org (Postfix) with ESMTPS id 4LH7k44QvGz9sFs for ; Tue, 7 Jun 2022 08:40:44 +1000 (AEST) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id AE50E3839C4C for ; Mon, 6 Jun 2022 22:40:42 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org AE50E3839C4C DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1654555242; bh=LMyuqEInbSGY1eeIKObQ8iv7+j6ynH73haXaaU6dweg=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=FOXaqNAfiTIZCUqcziWt3gwKrHIIyWgzz4ROCdHuyrxkgUT+RvvwNWV3EjSKJq+ao IwF1gd9CeCSWhdBt+AA44j0EdPre+c9i2oMthwbPSlU2ECvNUKJb+eicNAILqtvb/f 3fmu4GwoTkgqkhNBGPlEa4TDe68Zf5pje+YAbbFo= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pg1-x52d.google.com (mail-pg1-x52d.google.com [IPv6:2607:f8b0:4864:20::52d]) by sourceware.org (Postfix) with ESMTPS id 2F42938303D5 for ; Mon, 6 Jun 2022 22:37:40 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 2F42938303D5 Received: by mail-pg1-x52d.google.com with SMTP id c18so5797369pgh.11 for ; Mon, 06 Jun 2022 15:37:40 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=LMyuqEInbSGY1eeIKObQ8iv7+j6ynH73haXaaU6dweg=; b=LkQGczX2nXQIs0vWqqL+0mDupQ/8DZVIPA/9y/w/mRG59n9aZ4T/Be+7sVUsRUZox5 IpUoJRAOPGhRsiEpy5PvvzXfRv9de1AiirOnY35Q4+T7n4b2rrbQtHWt0lHGtwZaQepi IOxVs8aiLk1nY9dfiTqyePXuoX4EvMXbvsBEIQQ8vEDB/O+7UBiCZGeXdvsJTukYHd0Z cJRQQT/XYr9+z416xwP3frYPJ0/IER0NCzUL4NchFOn3X3V//3bXnVYGqa8fL1z5axw2 SzzSYErc7DQgSS3gyV0nXu+abhbhaQhdvJ0PWjfjR8WKgHl/J/IMp15qOwVOR/2/SywR bexg== X-Gm-Message-State: AOAM5304UjndrcXfODrmgadJkud2P4KTlfDBdilUrQFos3U4VvkY6e0s 2BfTJ5Z8wM/u/vXU4Q2CaIK8FAaH9Es= X-Google-Smtp-Source: ABdhPJx5TAEcKi+h7W27F9Dw2xaoTvbq4NCCHe4g/8Y1HUx0W88tU8tLE1eiTZAjcvcMun5MBD5zog== X-Received: by 2002:a05:6a00:3491:b0:51c:1d3b:b0b0 with SMTP id cp17-20020a056a00349100b0051c1d3bb0b0mr6985647pfb.68.1654555058694; Mon, 06 Jun 2022 15:37:38 -0700 (PDT) Received: from noah-tgl.. 
([2600:1010:b04a:6ef:921e:3108:9361:2ef8]) by smtp.gmail.com with ESMTPSA id e22-20020a17090a4a1600b001e345c579d5sm10417532pjh.26.2022.06.06.15.37.37 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 06 Jun 2022 15:37:38 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v4 5/8] x86: Optimize memrchr-evex.S Date: Mon, 6 Jun 2022 15:37:23 -0700 Message-Id: <20220606223726.2082226-5-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220606223726.2082226-1-goldstein.w.n@gmail.com> References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> <20220606223726.2082226-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.9 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org Sender: "Libc-alpha" The new code: 1. prioritizes smaller user-arg lengths more. 2. optimizes target placement more carefully 3. reuses logic more 4. fixes up various inefficiencies in the logic. The biggest case here is the `lzcnt` logic for checking returns which saves either a branch or multiple instructions. The total code size saving is: 263 bytes Geometric Mean of all benchmarks New / Old: 0.755 Regressions: There are some regressions. Particularly where the length (user arg length) is large but the position of the match char is near the begining of the string (in first VEC). This case has roughly a 20% regression. This is because the new logic gives the hot path for immediate matches to shorter lengths (the more common input). This case has roughly a 35% speedup. Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-evex.S | 539 ++++++++++++------------ 1 file changed, 268 insertions(+), 271 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S index 0b99709c6b..ad541c0e50 100644 --- a/sysdeps/x86_64/multiarch/memrchr-evex.S +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S @@ -19,319 +19,316 @@ #if IS_IN (libc) # include +# include "evex256-vecs.h" +# if VEC_SIZE != 32 +# error "VEC_SIZE != 32 unimplemented" +# endif + +# ifndef MEMRCHR +# define MEMRCHR __memrchr_evex +# endif + +# define PAGE_SIZE 4096 +# define VECMATCH VEC(0) + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(MEMRCHR, 6) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) + + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdi, %rdx), %rax + vpbroadcastb %esi, %VECMATCH + + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. 
*/ + vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + + /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) +L(ret_vec_x0_test): + + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret -# define VMOVA vmovdqa64 - -# define YMMMATCH ymm16 - -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (__memrchr_evex) - /* Broadcast CHAR to YMMMATCH. */ - vpbroadcastb %esi, %YMMMATCH - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP - - /* Check the last VEC_SIZE bytes. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx - - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. - There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 - kord %k1, %k2, %k5 - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 - - kord %k3, %k4, %k6 - kortestd %k5, %k6 - jz L(loop_4x_vec) - - /* There is a match. */ - kmovd %k4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - kmovd %k1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0_dec): + decq %rax +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax ret - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3) + /* Align rax (pointer to string). */ + andq $-VEC_SIZE, %rax - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x2) + /* Recompute length after aligning. 
*/ + movq %rax, %rdx - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 - kmovd %k3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - vpcmpb $0, (%rdi), %YMMMATCH, %k4 - kmovd %k4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - ret + subq %rdi, %rdx - .p2align 4 + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) + + /* Must dec rax because L(ret_vec_x0_test) expects it. */ + decq %rax cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Don't use rax for pointer here because EVEX has better encoding with + offset % VEC_SIZE == 0. */ + vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 + kmovd %k0, %ecx + /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpb $0, (%rsi), %VECMATCH, %k0 + kmovd %k0, %r8d + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %ecx + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %ecx + shlxl %ecx, %r8d, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + cmpl %ecx, %edx + jle L(zero_1) + subq %rcx, %rax ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax + /* Continue creating zero labels that fit in aligning bytes and get + 2-byte encoding / are in the same cache line as condition. */ +L(zero_1): + xorl %eax, %eax ret - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax ret - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + .p2align 4,, 8 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - ret + vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + /* Need no matter what. 
*/ + vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + + /* Need no matter what. */ + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx - - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx - - kmovd %k1, %eax - - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 8 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_1) ret - .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx - - /* Check for zero length. */ - testl %edx, %edx - jz L(zero) - - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) - - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + .p2align 4,, 8 +L(ret_vec_x2): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax + ret - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + .p2align 4,, 8 +L(ret_vec_x3): + bsrl %ecx, %ecx + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret - /* Check the last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 - kmovd %k1, %eax + .p2align 4,, 8 +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + testl %ecx, %ecx + jnz L(ret_vec_x3) - andl %edx, %eax - testl %eax, %eax - jz L(zero) + /* Check if near end before re-aligning (otherwise might do an + unnecissary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - ret + decq %rax + andq $-(VEC_SIZE * 4), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + andq $-(VEC_SIZE * 4), %rdx .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx - - /* Check the last VEC. */ - vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +L(loop_4x_vec): + /* Store 1 were not-equals and 0 where equals in k1 (used to mask later + on). */ + vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 + + /* VEC(2/3) will have zero-byte where we found a CHAR. */ + vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) + vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) + vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 + + /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where + CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ + vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} + vptestnmb %VEC(3), %VEC(3), %k2 + + /* Any 1s and we found CHAR. */ + kortestd %k2, %k4 + jnz L(loop_end) + + addq $-(VEC_SIZE * 4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) + + /* Need to re-adjust rdx / rax for L(last_4x_vec). */ + subq $-(VEC_SIZE * 4), %rdx + movq %rdx, %rax + subl %edi, %edx +L(last_4x_vec): + + /* Used no matter what. 
*/ + vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - kmovd %k1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_dec) - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jnz L(last_vec_x1) + vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - /* Check the second last VEC. */ - vpcmpb $0, (%rdi), %YMMMATCH, %k1 + testl %ecx, %ecx + jnz L(ret_vec_x1) - movl %r8d, %ecx + /* Used no matter what. */ + vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 + kmovd %k0, %ecx - kmovd %k1, %eax + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. */ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2 + 1), %rax + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret_1) + xorl %eax, %eax +L(ret_1): + ret - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 6 +L(loop_end): + kmovd %k1, %ecx + notl %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vptestnmb %VEC(2), %VEC(2), %k0 + kmovd %k0, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + kmovd %k2, %ecx + kmovd %k4, %esi + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + addq %rcx, %rax + ret + .p2align 4,, 4 +L(ret_vec_x0_end): + addq $(VEC_SIZE), %rax +L(ret_vec_x1_end): + bsrl %ecx, %ecx + leaq (VEC_SIZE * 2)(%rax, %rcx), %rax ret -END (__memrchr_evex) + +END(MEMRCHR) #endif From patchwork Mon Jun 6 22:37:24 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1639656 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: bilbo.ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.a=rsa-sha256 header.s=default header.b=uA6GfD7w; dkim-atps=neutral Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=8.43.85.97; helo=sourceware.org; envelope-from=libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Received: from sourceware.org (server2.sourceware.org [8.43.85.97]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by bilbo.ozlabs.org (Postfix) with ESMTPS id 4LH7lp23dcz9sFs for ; Tue, 7 Jun 2022 08:42:14 +1000 (AEST) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 6493E38303F4 for ; Mon, 6 Jun 2022 22:42:12 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 6493E38303F4 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1654555332; bh=jtptvUOSsl5wlnSZev5tr343JtDVnp1NjUalqQNa8v0=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=uA6GfD7wk48cfXyHBOsrgEbRzkFUl8JDYnrEMFf/n30vT/vAu1+7PJsglQseUMcCq 4ihj2KDc9jFFeJ+4d57ahaCm34H9tDUHh8Hd4UbriOl/kpo2Ljnvs7aprw/JoGDlOS SBvj0ufdKyY8C6YmUR0eNIaro332q4OvXgXNTVwQ= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org 
To: libc-alpha@sourceware.org
Subject: [PATCH v4 6/8] x86: Optimize memrchr-avx2.S
Date: Mon, 6 Jun 2022 15:37:24 -0700
Message-Id: <20220606223726.2082226-6-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220606223726.2082226-1-goldstein.w.n@gmail.com>
References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> <20220606223726.2082226-1-goldstein.w.n@gmail.com>
MIME-Version: 1.0
From: Noah Goldstein
Reply-To: Noah Goldstein

The new code:
    1. prioritizes smaller user-arg lengths more.
    2. optimizes target placement more carefully.
    3. reuses logic more.
    4. fixes up various inefficiencies in the logic. The biggest case here
       is the `lzcnt` logic for checking returns, which saves either a
       branch or multiple instructions.

The total code size saving is: 306 bytes
Geometric Mean of all benchmarks New / Old: 0.760

Regressions: There are some regressions, particularly where the length
(user-arg length) is large but the position of the match char is near the
beginning of the string (in the first VEC). This case has roughly a 10-20%
regression. The cause is that the new logic gives the hot path for
immediate matches to shorter lengths, the more common input; that
short-length case sees roughly a 15-45% speedup.
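As a rough C model of the `lzcnt` return path described above (an
illustrative sketch only, not part of the patch; the helper name and its
arguments are invented for the example):

#include <stddef.h>
#include <stdint.h>

/* MASK is the 32-bit compare result for the 32 bytes ending at END_M1
   (bit i corresponds to byte END_M1 - 31 + i); LEN is how many of those
   bytes actually lie inside the caller's buffer (LEN <= 32).  */
static inline void *
memrchr_tail_sketch (unsigned char *end_m1, uint32_t mask, unsigned int len)
{
  /* lzcnt of a zero mask is 32, which is >= any LEN <= 32, so the
     "no match" and "match before the buffer" cases share one compare,
     just as the assembly's cmp/jle does.  __builtin_clz is undefined
     for 0, so model that case explicitly.  */
  unsigned int lz = mask != 0 ? (unsigned int) __builtin_clz (mask) : 32;
  if (len <= lz)
    return NULL;
  /* (end pointer - 1) minus lzcnt lands on the last matching byte.  */
  return end_m1 - lz;
}

Each of the old per-vector return paths rebuilt the pointer from the
buffer start with a bsr plus one or two adds; folding the bounds check
and the pointer math into a single lzcnt/sub pair against the end pointer
is where the "saves either a branch or multiple instructions" claim above
comes from.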
Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memrchr-avx2.S | 538 ++++++++++---------- 2 files changed, 260 insertions(+), 279 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S index cea2d2a72d..5e9beeeef2 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMRCHR __memrchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index ba2ce7cb03..6915e1c373 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -21,340 +21,320 @@ # include # ifndef MEMRCHR -# define MEMRCHR __memrchr_avx2 +# define MEMRCHR __memrchr_avx2 # endif # ifndef VZEROUPPER -# define VZEROUPPER vzeroupper +# define VZEROUPPER vzeroupper # endif +// abf-off # ifndef SECTION # define SECTION(p) p##.avx # endif +// abf-on + +# define VEC_SIZE 32 +# define PAGE_SIZE 4096 + .section SECTION(.text), "ax", @progbits +ENTRY(MEMRCHR) +# ifdef __ILP32__ + /* Clear upper bits. */ + and %RDX_LP, %RDX_LP +# else + test %RDX_LP, %RDX_LP +# endif + jz L(zero_0) -# define VEC_SIZE 32 - - .section SECTION(.text),"ax",@progbits -ENTRY (MEMRCHR) - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 - vpbroadcastb %xmm0, %ymm0 - - sub $VEC_SIZE, %RDX_LP - jbe L(last_vec_or_less) - - add %RDX_LP, %RDI_LP + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a + correct page cross check and 2) it correctly sets up end ptr to be + subtract by lzcnt aligned. */ + leaq -1(%rdx, %rdi), %rax - /* Check the last VEC_SIZE bytes. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - subq $(VEC_SIZE * 4), %rdi - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(aligned_more) + vpbroadcastb %xmm0, %ymm0 - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - addq $VEC_SIZE, %rdx - andq $-VEC_SIZE, %rdi - subq %rcx, %rdx + /* Check if we can load 1x VEC without cross a page. */ + testl $(PAGE_SIZE - VEC_SIZE), %eax + jz L(page_cross) + + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + cmpq $VEC_SIZE, %rdx + ja L(more_1x_vec) + +L(ret_vec_x0_test): + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which + will gurantee edx (len) is less than it. */ + lzcntl %ecx, %ecx + + /* Hoist vzeroupper (not great for RTM) to save code size. This allows + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(aligned_more): - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x0) - - /* Align data to 4 * VEC_SIZE for loop with fewer branches. 
- There are some overlaps with above if data isn't aligned - to 4 * VEC_SIZE. */ - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx - jz L(loop_4x_vec) - - addq $(VEC_SIZE * 4), %rdi - addq $(VEC_SIZE * 4), %rdx - andq $-(VEC_SIZE * 4), %rdi - subq %rcx, %rdx + /* Fits in aligning bytes of first cache line. */ +L(zero_0): + xorl %eax, %eax + ret - .p2align 4 -L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - subq $(VEC_SIZE * 4), %rdi - subq $(VEC_SIZE * 4), %rdx - jbe L(last_4x_vec_or_less) - - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - - vpcmpeqb %ymm1, %ymm0, %ymm1 - vpcmpeqb %ymm2, %ymm0, %ymm2 - vpcmpeqb %ymm3, %ymm0, %ymm3 - vpcmpeqb %ymm4, %ymm0, %ymm4 - - vpor %ymm1, %ymm2, %ymm5 - vpor %ymm3, %ymm4, %ymm6 - vpor %ymm5, %ymm6, %ymm5 - - vpmovmskb %ymm5, %eax - testl %eax, %eax - jz L(loop_4x_vec) - - /* There is a match. */ - vpmovmskb %ymm4, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x1) - - vpmovmskb %ymm1, %eax - bsrl %eax, %eax - addq %rdi, %rax + .p2align 4,, 9 +L(ret_vec_x0): + lzcntl %ecx, %ecx + subq %rcx, %rax L(return_vzeroupper): ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(last_4x_vec_or_less): - addl $(VEC_SIZE * 4), %edx - cmpl $(VEC_SIZE * 2), %edx - jbe L(last_2x_vec) - - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 - vpmovmskb %ymm2, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 - vpmovmskb %ymm3, %eax - testl %eax, %eax - jnz L(last_vec_x1_check) - cmpl $(VEC_SIZE * 3), %edx - jbe L(zero) - - vpcmpeqb (%rdi), %ymm0, %ymm4 - vpmovmskb %ymm4, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 4), %rdx - addq %rax, %rdx - jl L(zero) - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 + .p2align 4,, 10 +L(more_1x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) + + /* Align rax (string pointer). */ + andq $-VEC_SIZE, %rax + + /* Recompute remaining length after aligning. */ + movq %rax, %rdx + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 + subq %rdi, %rdx + decq %rax + vpmovmskb %ymm1, %ecx + /* Fall through for short (hotter than length). */ + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) L(last_2x_vec): - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(last_vec_x3_check) cmpl $VEC_SIZE, %edx - jbe L(zero) - - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - subq $(VEC_SIZE * 2), %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax - VZEROUPPER_RETURN - - .p2align 4 -L(last_vec_x0): - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN + jbe L(ret_vec_x0_test) + + testl %ecx, %ecx + jnz L(ret_vec_x0) + + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* 64-bit lzcnt. This will naturally add 32 to position. 
*/ + lzcntq %rcx, %rcx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret - .p2align 4 -L(last_vec_x1): - bsrl %eax, %eax - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x2): - bsrl %eax, %eax - addl $(VEC_SIZE * 2), %eax - addq %rdi, %rax + /* Inexpensive place to put this regarding code size / target alignments + / ICache NLP. Necessary for 2-byte encoding of jump to page cross + case which inturn in necessray for hot path (len <= VEC_SIZE) to fit + in first cache line. */ +L(page_cross): + movq %rax, %rsi + andq $-VEC_SIZE, %rsi + vpcmpeqb (%rsi), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + /* Shift out negative alignment (because we are starting from endptr and + working backwards). */ + movl %eax, %r8d + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ + notl %r8d + shlxl %r8d, %ecx, %ecx + cmpq %rdi, %rsi + ja L(more_1x_vec) + lzcntl %ecx, %ecx + COND_VZEROUPPER + cmpl %ecx, %edx + jle L(zero_0) + subq %rcx, %rax + ret + .p2align 4,, 11 +L(ret_vec_x1): + /* This will naturally add 32 to position. */ + lzcntq %rcx, %rcx + subq %rcx, %rax VZEROUPPER_RETURN + .p2align 4,, 10 +L(more_2x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x0) - .p2align 4 -L(last_vec_x3): - bsrl %eax, %eax - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - ret + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) - .p2align 4 -L(last_vec_x1_check): - bsrl %eax, %eax - subq $(VEC_SIZE * 3), %rdx - addq %rax, %rdx - jl L(zero) - addl $VEC_SIZE, %eax - addq %rdi, %rax - VZEROUPPER_RETURN - .p2align 4 -L(last_vec_x3_check): - bsrl %eax, %eax - subq $VEC_SIZE, %rdx - addq %rax, %rdx - jl L(zero) - addl $(VEC_SIZE * 3), %eax - addq %rdi, %rax - VZEROUPPER_RETURN + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - .p2align 4 -L(zero): - xorl %eax, %eax - VZEROUPPER_RETURN + subq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + cmpl $(VEC_SIZE * -1), %edx + jle L(ret_vec_x2_test) + +L(last_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) + + /* Needed no matter what. */ + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 3), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - .p2align 4 -L(null): + /* First in aligning bytes. */ +L(zero_2): xorl %eax, %eax ret - .p2align 4 -L(last_vec_or_less_aligned): - movl %edx, %ecx + .p2align 4,, 4 +L(ret_vec_x2_test): + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + ja L(zero_2) + ret - vpcmpeqb (%rdi), %ymm0, %ymm1 - movl $1, %edx - /* Support rdx << 32. */ - salq %cl, %rdx - subq $1, %rdx + .p2align 4,, 11 +L(ret_vec_x2): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - vpmovmskb %ymm1, %eax + .p2align 4,, 14 +L(ret_vec_x3): + /* ecx must be non-zero. */ + bsrl %ecx, %ecx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the trailing bytes. */ - andl %edx, %eax - testl %eax, %eax - jz L(zero) - bsrl %eax, %eax - addq %rdi, %rax - VZEROUPPER_RETURN .p2align 4 -L(last_vec_or_less): - addl $VEC_SIZE, %edx +L(more_4x_vec): + testl %ecx, %ecx + jnz L(ret_vec_x2) - /* Check for zero length. 
*/ - testl %edx, %edx - jz L(null) + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx - jz L(last_vec_or_less_aligned) + testl %ecx, %ecx + jnz L(ret_vec_x3) - movl %ecx, %esi - movl %ecx, %r8d - addl %edx, %esi - andq $-VEC_SIZE, %rdi + /* Check if near end before re-aligning (otherwise might do an + unnecissary loop iteration). */ + addq $-(VEC_SIZE * 4), %rax + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec) - subl $VEC_SIZE, %esi - ja L(last_vec_2x_aligned) + /* Align rax to (VEC_SIZE - 1). */ + orq $(VEC_SIZE * 4 - 1), %rax + movq %rdi, %rdx + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because + lengths that overflow can be valid and break the comparison. */ + orq $(VEC_SIZE * 4 - 1), %rdx - /* Check the last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - - /* Remove the leading and trailing bytes. */ - sarl %cl, %eax - movl %edx, %ecx + .p2align 4 +L(loop_4x_vec): + /* Need this comparison next no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 - movl $1, %edx - sall %cl, %edx - subl $1, %edx + vpor %ymm1, %ymm2, %ymm2 + vpor %ymm3, %ymm4, %ymm4 + vpor %ymm2, %ymm4, %ymm4 + vpmovmskb %ymm4, %esi - andl %edx, %eax - testl %eax, %eax - jz L(zero) + testl %esi, %esi + jnz L(loop_end) - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax - VZEROUPPER_RETURN + addq $(VEC_SIZE * -4), %rax + cmpq %rdx, %rax + jne L(loop_4x_vec) - .p2align 4 -L(last_vec_2x_aligned): - movl %esi, %ecx + subl %edi, %edx + incl %edx - /* Check the last VEC. */ - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 +L(last_4x_vec): + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - movl $1, %edx - sall %cl, %edx - subl $1, %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) - vpmovmskb %ymm1, %eax + testl %ecx, %ecx + jnz L(ret_vec_x0_end) - /* Remove the trailing bytes. */ - andl %edx, %eax + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) - testl %eax, %eax - jnz L(last_vec_x1) + /* Used no matter what. */ + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 + vpmovmskb %ymm1, %ecx - /* Check the second last VEC. */ - vpcmpeqb (%rdi), %ymm0, %ymm1 + cmpl $(VEC_SIZE * 3), %edx + ja L(last_vec) + + lzcntl %ecx, %ecx + subq $(VEC_SIZE * 2), %rax + COND_VZEROUPPER + subq %rcx, %rax + cmpq %rax, %rdi + jbe L(ret0) + xorl %eax, %eax +L(ret0): + ret - movl %r8d, %ecx - vpmovmskb %ymm1, %eax + .p2align 4 +L(loop_end): + vpmovmskb %ymm1, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x0_end) + + vpmovmskb %ymm2, %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1_end) + + vpmovmskb %ymm3, %ecx + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) + then it won't affect the result in esi (VEC4). If ecx is non-zero + then CHAR in VEC3 and bsrq will use that position. */ + salq $32, %rcx + orq %rsi, %rcx + bsrq %rcx, %rcx + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax + VZEROUPPER_RETURN - /* Remove the leading bytes. Must use unsigned right shift for - bsrl below. */ - shrl %cl, %eax - testl %eax, %eax - jz L(zero) + .p2align 4,, 4 +L(ret_vec_x1_end): + /* 64-bit version will automatically add 32 (VEC_SIZE). 
*/ + lzcntq %rcx, %rcx + subq %rcx, %rax + VZEROUPPER_RETURN - bsrl %eax, %eax - addq %rdi, %rax - addq %r8, %rax + .p2align 4,, 4 +L(ret_vec_x0_end): + lzcntl %ecx, %ecx + subq %rcx, %rax VZEROUPPER_RETURN -END (MEMRCHR) + + /* 2 bytes until next cache line. */ +END(MEMRCHR) #endif From patchwork Mon Jun 6 22:37:25 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1639655 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: bilbo.ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.a=rsa-sha256 header.s=default header.b=FvGI/C/1; dkim-atps=neutral Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=8.43.85.97; helo=sourceware.org; envelope-from=libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Received: from sourceware.org (server2.sourceware.org [8.43.85.97]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by bilbo.ozlabs.org (Postfix) with ESMTPS id 4LH7kz6ZzTz9sFs for ; Tue, 7 Jun 2022 08:41:31 +1000 (AEST) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 143B938303E5 for ; Mon, 6 Jun 2022 22:41:30 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 143B938303E5 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1654555290; bh=Oj1cuLHazeZ+lZshe0FQToSGp/NmDOCaHna+8p/83ck=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=FvGI/C/1y9objTu7OGldqNxzbXb/u7BQK66sp9Lnk1QtPb126CP5pj7h5x/P0mHSE n+7S4Hm0ia946Qs69WZhZT1uLsALEVSOjsDizepsYAuUaAuiLMdwLDFyEUWvYNUbmi PyXRaYFemiJLm7hct973Vw3DYoliDPVvPao+gG5c= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pg1-x532.google.com (mail-pg1-x532.google.com [IPv6:2607:f8b0:4864:20::532]) by sourceware.org (Postfix) with ESMTPS id 8B66438303E2 for ; Mon, 6 Jun 2022 22:37:42 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 8B66438303E2 Received: by mail-pg1-x532.google.com with SMTP id c18so5797443pgh.11 for ; Mon, 06 Jun 2022 15:37:42 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=Oj1cuLHazeZ+lZshe0FQToSGp/NmDOCaHna+8p/83ck=; b=PXgaUKl3EgNRjVPWSz9CrBJCZqsDMv6Y6NbE5AFHw45IjCWLlts+lcBxLp+6pKi9RI pqC9hqlL1F0fdeJx+Cg/ZgwBiNEimwBBv6DFL3cYQBZ4piG+s4kLsESt8fFrtxvyrhbk ZBlrUOMfVu1Z070JJRdfTSE1VYV1M4zsf7RFoJjmjevwbajaefChl0WKHcXncCm8tQ4u Wx6TqssIXlvzVujzxu5JVKQqr5Hr+z5rYXjGiHnRkB69o73VmahC8Tk3W6F+EgRtRevf xNWX85nF/+sfTlN4GUjw9Bo+PF9fqoLNvYlCUkdeSQsmMF1p3o7rUO2ea4jTGRJh7YP8 Rg7Q== X-Gm-Message-State: AOAM530IUfNbGu4wrz4v9HiOSWLEBOKS6s4ocszs72b5xdkLKUdR080D cR8jB8NgSfUM0INyhoxB8oF2ku6GWlQ= X-Google-Smtp-Source: ABdhPJwSClZnU00ARLbIhP3u31ZlGeMNfNbFrHYurgrBLuxvLwLUIuIZZBvMlzH853mH11kjkKHwCw== X-Received: by 2002:a05:6a00:996:b0:505:b6d2:abc8 with SMTP id u22-20020a056a00099600b00505b6d2abc8mr93703499pfg.11.1654555061358; Mon, 06 Jun 2022 15:37:41 -0700 (PDT) Received: from noah-tgl.. 
([2600:1010:b04a:6ef:921e:3108:9361:2ef8]) by smtp.gmail.com with ESMTPSA id e22-20020a17090a4a1600b001e345c579d5sm10417532pjh.26.2022.06.06.15.37.40 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 06 Jun 2022 15:37:41 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v4 7/8] x86: Shrink code size of memchr-avx2.S Date: Mon, 6 Jun 2022 15:37:25 -0700 Message-Id: <20220606223726.2082226-7-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220606223726.2082226-1-goldstein.w.n@gmail.com> References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> <20220606223726.2082226-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-11.9 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org Sender: "Libc-alpha" This is not meant as a performance optimization. The previous code was far to liberal in aligning targets and wasted code size unnecissarily. The total code size saving is: 59 bytes There are no major changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 0.967 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 1 + sysdeps/x86_64/multiarch/memchr-avx2.S | 109 +++++++++++---------- 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S index 87b076c7c4..c4d71938c5 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -2,6 +2,7 @@ # define MEMCHR __memchr_avx2_rtm #endif +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST #define ZERO_UPPER_VEC_REGISTERS_RETURN \ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 75bd7262e0..28a01280ec 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -57,7 +57,7 @@ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 5) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ # ifdef __ILP32__ @@ -87,12 +87,14 @@ ENTRY (MEMCHR) # endif testl %eax, %eax jz L(aligned_more) - tzcntl %eax, %eax + bsfl %eax, %eax addq %rdi, %rax - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + # ifndef USE_AS_RAWMEMCHR - .p2align 5 + .p2align 4 L(first_vec_x0): /* Check if first match was before length. */ tzcntl %eax, %eax @@ -100,58 +102,31 @@ L(first_vec_x0): /* NB: Multiply length by 4 to get byte count. */ sall $2, %edx # endif - xorl %ecx, %ecx + COND_VZEROUPPER + /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch + block. branch here as opposed to cmovcc is not that costly. Common + usage of memchr is to check if the return was NULL (if string was + known to contain CHAR user would use rawmemchr). 
This branch will be + highly correlated with the user branch and can be used by most + modern branch predictors to predict the user branch. */ cmpl %eax, %edx - leaq (%rdi, %rax), %rax - cmovle %rcx, %rax - VZEROUPPER_RETURN - -L(null): - xorl %eax, %eax - ret -# endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr - and rdi for rawmemchr. */ - orq $(VEC_SIZE - 1), %ALGN_PTR_REG - VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax -# ifndef USE_AS_RAWMEMCHR - /* Calculate length until end of page (length checked for a - match). */ - leaq 1(%ALGN_PTR_REG), %rsi - subq %RRAW_PTR_REG, %rsi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get wchar_t count. */ - shrl $2, %esi -# endif -# endif - /* Remove the leading bytes. */ - sarxl %ERAW_PTR_REG, %eax, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + jle L(null) + addq %rdi, %rax + ret # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax - addq %RRAW_PTR_REG, %rax -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 + .p2align 4,, 10 L(first_vec_x1): - tzcntl %eax, %eax + bsfl %eax, %eax incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - +# ifndef USE_AS_RAWMEMCHR + /* First in aligning bytes here. */ +L(null): + xorl %eax, %eax + ret +# endif .p2align 4 L(first_vec_x2): tzcntl %eax, %eax @@ -340,7 +315,7 @@ L(first_vec_x1_check): incq %rdi addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 + .p2align 4,, 6 L(set_zero_end): xorl %eax, %eax VZEROUPPER_RETURN @@ -428,5 +403,39 @@ L(last_vec_x3): VZEROUPPER_RETURN # endif + .p2align 4 +L(cross_page_boundary): + /* Save pointer before aligning as its original value is necessary for + computer return address if byte is found or adjusting length if it + is not and this is memchr. */ + movq %rdi, %rcx + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi for + rawmemchr. */ + andq $-VEC_SIZE, %ALGN_PTR_REG + VPCMPEQ (%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +# ifndef USE_AS_RAWMEMCHR + /* Calculate length until end of page (length checked for a match). */ + leal VEC_SIZE(%ALGN_PTR_REG), %esi + subl %ERAW_PTR_REG, %esi +# ifdef USE_AS_WMEMCHR + /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %esi +# endif +# endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +# ifndef USE_AS_RAWMEMCHR + /* Check the end of data. 
*/ + cmpq %rsi, %rdx + jbe L(first_vec_x0) +# endif + testl %eax, %eax + jz L(cross_page_continue) + bsfl %eax, %eax + addq %RRAW_PTR_REG, %rax + VZEROUPPER_RETURN + + END (MEMCHR) #endif From patchwork Mon Jun 6 22:37:26 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 1639657 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: bilbo.ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.a=rsa-sha256 header.s=default header.b=ytKjHJBx; dkim-atps=neutral Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=2620:52:3:1:0:246e:9693:128c; helo=sourceware.org; envelope-from=libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Received: from sourceware.org (server2.sourceware.org [IPv6:2620:52:3:1:0:246e:9693:128c]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by bilbo.ozlabs.org (Postfix) with ESMTPS id 4LH7mc2HNdz9sFs for ; Tue, 7 Jun 2022 08:42:56 +1000 (AEST) Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 2F12B3839C4C for ; Mon, 6 Jun 2022 22:42:54 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 2F12B3839C4C DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1654555374; bh=g51rtSFrtc5/esRF2nnwl2fDrP9rbKMdQCpYoYv3ndg=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=ytKjHJBxoZ1qdyfW2UUdMLZVXftNwCQwWIWiQukP17t997+9q5ZMJb6cgfrs8gF5y X/K6FXbZWnLdKUPrlaGpb1vyrchwa6CUAlHZyi4bZlhVM7uLznthA26uRxVZjggYEv ePwkEi1jyjZET3h2zDfdcFQfJ/zMciaN+7riRmCg= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pj1-x102d.google.com (mail-pj1-x102d.google.com [IPv6:2607:f8b0:4864:20::102d]) by sourceware.org (Postfix) with ESMTPS id C386238303E8 for ; Mon, 6 Jun 2022 22:37:43 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org C386238303E8 Received: by mail-pj1-x102d.google.com with SMTP id e9so3467877pju.5 for ; Mon, 06 Jun 2022 15:37:43 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=g51rtSFrtc5/esRF2nnwl2fDrP9rbKMdQCpYoYv3ndg=; b=L+Vceti11fptqKeuZxbQQ0N5oXUp2wZaB3Zqg4pQGazldsCE7Q5GIhk1qEwI4POA8m ipCxfKFDQo132S4Yq3UMhjJw2ctolBmT9NO4fNDWOA+vg1t41oTbvGeXJzCKSOWaaeyf RmUGDbx9KYvcZzFdJaoQsK1qvhLP/KSimn2lV/LBTNqfRBxB4y1HRhYhp3enwRKsBM5s uLIYJUJRcMaobNvvBvTG8FDvWlcG4lmGF2KDpKFmshHkkAUqiLM4R/uETvGMJ0y+RvxU zhc5Bx5pues+F9+jx2BZwsfzMvPiChuJLR0/RNsRzT/uqljoB/Wr8XdRMWISwCuAhbo8 m2kA== X-Gm-Message-State: AOAM533ibIITqiOn6H0TEgSOAVeL4MZC7pmpGOqm/mLEf1VRhI51/wFJ USh5FexsfGqij7K855AX3zZ3RQB0Iz0= X-Google-Smtp-Source: ABdhPJyLMelAj/dQlmGOqL6g/orV+n6NywSeIPyMtZ0sCBEHTVyIDYnBbBJFxJ0iw+qTbxN7ubppWg== X-Received: by 2002:a17:902:cf0f:b0:15a:24e0:d9b0 with SMTP id i15-20020a170902cf0f00b0015a24e0d9b0mr25755108plg.42.1654555062637; Mon, 06 Jun 2022 15:37:42 -0700 (PDT) Received: from noah-tgl.. 
([2600:1010:b04a:6ef:921e:3108:9361:2ef8]) by smtp.gmail.com with ESMTPSA id e22-20020a17090a4a1600b001e345c579d5sm10417532pjh.26.2022.06.06.15.37.41 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 06 Jun 2022 15:37:42 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v4 8/8] x86: Shrink code size of memchr-evex.S Date: Mon, 6 Jun 2022 15:37:26 -0700 Message-Id: <20220606223726.2082226-8-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220606223726.2082226-1-goldstein.w.n@gmail.com> References: <20220603044229.2180216-2-goldstein.w.n@gmail.com> <20220606223726.2082226-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+incoming=patchwork.ozlabs.org@sourceware.org Sender: "Libc-alpha" This is not meant as a performance optimization. The previous code was far to liberal in aligning targets and wasted code size unnecissarily. The total code size saving is: 64 bytes There are no non-negligible changes in the benchmarks. Geometric Mean of all benchmarks New / Old: 1.000 Full xcheck passes on x86_64. --- sysdeps/x86_64/multiarch/memchr-evex.S | 46 ++++++++++++++------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index cfaf02907d..0fd11b7632 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -88,7 +88,7 @@ # define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits -ENTRY (MEMCHR) +ENTRY_P2ALIGN (MEMCHR, 6) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP @@ -131,22 +131,24 @@ L(zero): xorl %eax, %eax ret - .p2align 5 + .p2align 4 L(first_vec_x0): - /* Check if first match was before length. */ - tzcntl %eax, %eax - xorl %ecx, %ecx - cmpl %eax, %edx - leaq (%rdi, %rax, CHAR_SIZE), %rax - cmovle %rcx, %rax + /* Check if first match was before length. NB: tzcnt has false data- + dependency on destination. eax already had a data-dependency on esi + so this should have no affect here. */ + tzcntl %eax, %esi +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rsi, CHAR_SIZE), %rdi +# else + addq %rsi, %rdi +# endif + xorl %eax, %eax + cmpl %esi, %edx + cmovg %rdi, %rax ret -# else - /* NB: first_vec_x0 is 17 bytes which will leave - cross_page_boundary (which is relatively cold) close enough - to ideal alignment. So only realign L(cross_page_boundary) if - rawmemchr. */ - .p2align 4 # endif + + .p2align 4 L(cross_page_boundary): /* Save pointer before aligning as its original value is necessary for computer return address if byte is found or @@ -400,10 +402,14 @@ L(last_2x_vec): L(zero_end): ret +L(set_zero_end): + xorl %eax, %eax + ret .p2align 4 L(first_vec_x1_check): - tzcntl %eax, %eax + /* eax must be non-zero. Use bsfl to save code size. */ + bsfl %eax, %eax /* Adjust length. 
*/ subl $-(CHAR_PER_VEC * 4), %edx /* Check if match within remaining length. */ @@ -412,9 +418,6 @@ L(first_vec_x1_check): /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ret -L(set_zero_end): - xorl %eax, %eax - ret .p2align 4 L(loop_4x_vec_end): @@ -464,7 +467,7 @@ L(loop_4x_vec_end): # endif ret - .p2align 4 + .p2align 4,, 10 L(last_vec_x1_return): tzcntl %eax, %eax # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 @@ -496,6 +499,7 @@ L(last_vec_x3_return): # endif # ifndef USE_AS_RAWMEMCHR + .p2align 4,, 5 L(last_4x_vec_or_less_cmpeq): VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 kmovd %k0, %eax @@ -546,7 +550,7 @@ L(last_4x_vec): # endif andl %ecx, %eax jz L(zero_end2) - tzcntl %eax, %eax + bsfl %eax, %eax leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax L(zero_end2): ret @@ -562,6 +566,6 @@ L(last_vec_x3): leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ret # endif - + /* 7 bytes from next cache line. */ END (MEMCHR) #endif
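Both memchr variants above keep the same cross-page strategy for an
unaligned start: align the first load down to the vector size so it
cannot cross a page, then shift the match mask right by the misalignment
so that bits for bytes below the real start are discarded. A rough
AVX2-intrinsics sketch of that idea (illustrative only, not the glibc
code; the function name is invented):

#include <stdint.h>
#include <immintrin.h>

/* A 32-byte load from a 32-byte-aligned address can never cross a
   4096-byte page, which is what makes reading below S safe at the
   hardware level (the assembly relies on exactly this; strictly
   conforming C could not promise it).  */
static inline uint32_t
first_vec_match_mask (const char *s, char c)
{
  const char *aligned = (const char *) ((uintptr_t) s & ~(uintptr_t) 31);
  __m256i v = _mm256_load_si256 ((const __m256i *) aligned);
  __m256i eq = _mm256_cmpeq_epi8 (v, _mm256_set1_epi8 (c));
  uint32_t mask = (uint32_t) _mm256_movemask_epi8 (eq);
  /* The low (s - aligned) bits describe bytes before S; drop them so
     bit 0 of the result corresponds to S itself (the sarx step in the
     assembly).  */
  return mask >> ((uintptr_t) s & 31);
}

The memrchr patches use the mirror image of this in their L(page_cross)
paths: the mask is shifted left by the complement of the end offset so
that bits for bytes past the end fall off the top, and lzcnt then
measures straight back from the end pointer.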