From: Xuelei Zhang <zhangxuelei4@huawei.com>
Subject: [PATCH v2] aarch64: Optimized implementation of memcmp
Date: Tue, 22 Oct 2019 17:38:27 +0800
Message-ID: <20191022093827.9072-1-zhangxuelei4@huawei.com>

The loop body is expanded from a 16-byte comparison to a 64-byte
comparison, and the ldp instructions are switched from post-index
addressing to base-plus-offset addressing, so the source pointers are
updated once per 64-byte block rather than after every load pair.
Overall, memcmp becomes around 18% faster for inputs larger than
128 bytes.
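To make the structural change easier to follow, here is a rough C model
of the new loop (an illustration only, not part of the patch;
memcmp_model is a hypothetical name, and the 16-byte library compares
stand in for the ldp/ccmp sequences in the assembly):

#include <stddef.h>
#include <string.h>

int
memcmp_model (const void *a, const void *b, size_t n)
{
  const unsigned char *s1 = a, *s2 = b;

  /* Compare 64 bytes per iteration: four 16-byte compares at fixed
     offsets from the block base (the ldp ..., [src, off] accesses),
     with both pointers advanced only once per block.  */
  while (n >= 64)
    {
      for (size_t off = 0; off < 64; off += 16)
	{
	  int r = memcmp (s1 + off, s2 + off, 16);
	  if (r != 0)
	    return r;
	}
      s1 += 64;			/* add src1, src1, 64 */
      s2 += 64;			/* add src2, src2, 64 */
      n -= 64;			/* subs limit, limit, 64; b.pl L(loop64) */
    }

  /* Remaining 0-63 bytes: handled in the assembly by the 16-byte loop
     and the last_bytes / less16 / less8 tails.  */
  return memcmp (s1, s2, n);
}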
---
 sysdeps/aarch64/memcmp.S | 135 ++++++++++++++++++++++++++++-------------------
 1 file changed, 82 insertions(+), 53 deletions(-)

diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index f330154c7a0..390336b58f5 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -46,75 +46,83 @@ ENTRY_ALIGN (memcmp, 6)
 	DELOUSE (1)
 	DELOUSE (2)
 
-	subs	limit, limit, 8
-	b.lo	L(less8)
+	subs	limit, limit, 16
+	b.lo	L(less16)
 
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	cmp	data1, data2
-	b.ne	L(return)
-
-	subs	limit, limit, 8
-	b.gt	L(more16)
-
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	b	L(return)
-
-L(more16):
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	cmp	data1, data2
-	bne	L(return)
-
-	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
-	   strings.  */
-	subs	limit, limit, 16
-	b.ls	L(last_bytes)
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	ccmp	data1, data2, 0, ne
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
 
-	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
-	   try to align, so limit it only to strings larger than 128 bytes.  */
-	cmp	limit, 96
-	b.ls	L(loop16)
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+	cmp	limit, 112
+	b.lo	L(loop16)
 
-	/* Align src1 and adjust src2 with bytes not yet done.  */
 	and	tmp1, src1, 15
 	add	limit, limit, tmp1
 	sub	src1, src1, tmp1
 	sub	src2, src2, tmp1
+	subs	limit, limit, 48
 
-	/* Loop performing 16 bytes per iteration using aligned src1.
-	   Limit is pre-decremented by 16 and must be larger than zero.
-	   Exit if <= 16 bytes left to do or if the data is not equal.  */
+	/* Compare 128 bytes or more using aligned access.  */
 	.p2align 4
+L(loop64):
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	ldp	data1, data1h, [src1, 16]
+	ldp	data2, data2h, [src2, 16]
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	ldp	data1, data1h, [src1, 32]
+	ldp	data2, data2h, [src2, 32]
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	ldp	data1, data1h, [src1, 48]
+	ldp	data2, data2h, [src2, 48]
+	cmp	data1, data2
+	ccmp	data1h, data2h, 0, eq
+	b.ne	L(return64)
+
+	subs	limit, limit, 64
+	add	src1, src1, 64
+	add	src2, src2, 64
+	b.pl	L(loop64)
+	adds	limit, limit, 48
+	b.lo	L(last_bytes)
+
 L(loop16):
 	ldp	data1, data1h, [src1], 16
 	ldp	data2, data2h, [src2], 16
-	subs	limit, limit, 16
-	ccmp	data1, data2, 0, hi
+	cmp	data1, data2
 	ccmp	data1h, data2h, 0, eq
-	b.eq	L(loop16)
-
-	cmp	data1, data2
-	bne	L(return)
-	mov	data1, data1h
-	mov	data2, data2h
-	cmp	data1, data2
-	bne	L(return)
+	b.ne	L(return64)
 
+	subs	limit, limit, 16
+	b.hi	L(loop16)
 
 	/* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
 	add	src1, src1, limit
 	add	src2, src2, limit
 	ldp	data1, data1h, [src1]
 	ldp	data2, data2h, [src2]
-	cmp	data1, data2
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return64):
+	cmp	data1, data2
 	bne	L(return)
+L(return_pre):
 	mov	data1, data1h
 	mov	data2, data2h
-	cmp	data1, data2
-
-	/* Compare data bytes and set return value to 0, -1 or 1.  */
 L(return):
 #ifndef __AARCH64EB__
 	rev	data1, data1
@@ -127,25 +135,46 @@ L(ret_eq):
 	ret
 
 	.p2align 4
-	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less16):
+	adds	limit, limit, 8
+	b.lo	L(less8)	/* lo: total length < 8.  */
+	ldr	data1, [src1]
+	ldr	data2, [src2]
+	/* If len == 8, branch to the return path with the bytes just
+	   loaded; the second load is skipped.  */
+	ccmp	data1, data2, 0, ne
+	b.ne	L(return)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+	.p2align 4
 L(less8):
 	adds	limit, limit, 4
 	b.lo	L(less4)
-	ldr	data1w, [src1], 4
-	ldr	data2w, [src2], 4
-	cmp	data1w, data2w
+	ldr	data1w, [src1]
+	ldr	data2w, [src2]
+	ccmp	data1, data2, 0, ne
 	b.ne	L(return)
-	sub	limit, limit, 4
+	ldr	data1w, [src1, limit]
+	ldr	data2w, [src2, limit]
+	b	L(return)
 
+	.p2align 4
 L(less4):
 	adds	limit, limit, 4
-	beq	L(ret_eq)
+	beq	L(ret_0)
+
 L(byte_loop):
 	ldrb	data1w, [src1], 1
 	ldrb	data2w, [src2], 1
 	subs	limit, limit, 1
 	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
 	b.eq	L(byte_loop)
-	sub	result, data1w, data2w
+	sub	result, data1w, data2w
+	ret
+L(ret_0):
+	mov	result, 0
 	ret
 END (memcmp)
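For local validation, a sign-only sweep along these lines can be run
against the rebuilt library (a sketch; the 512-byte cap and the byte
values are arbitrary choices, and it assumes the system memcmp resolves
to this implementation):

#include <assert.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  enum { MAX = 512 };	/* covers the less8/less16, loop16 and loop64 paths */
  static unsigned char a[MAX], b[MAX];

  for (size_t n = 0; n <= MAX; n++)
    {
      memset (a, 0x5a, n);
      memcpy (b, a, n);
      assert (memcmp (a, b, n) == 0);

      /* Flip each position in turn; only the sign of the result is
	 guaranteed by the C standard, so check only the sign.  */
      for (size_t i = 0; i < n; i++)
	{
	  b[i] = 0x7f;		/* make b compare greater at offset i */
	  assert (memcmp (a, b, n) < 0);
	  assert (memcmp (b, a, n) > 0);
	  b[i] = 0x5a;		/* restore */
	}
    }

  puts ("memcmp sign checks passed");
  return 0;
}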