From patchwork Wed Apr 20 20:55:15 2016 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Gabriel F. T. Gomes" X-Patchwork-Id: 612842 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3qqvKh5nvJz9syq for ; Thu, 21 Apr 2016 06:55:36 +1000 (AEST) Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.b=Dj+6u9rE; dkim-atps=neutral DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:cc:subject:date:message-id; q=dns; s= default; b=Ib4Ga9hYww4B1h1m9U+KcgGbNg29Tu70nheoVmghrqhnQy9xoBfLt PDTONwyFe0bqs8wn6fbq/Grx/9z8X1c8/68NLtyD+Eh1xV1dqyquHtee8loegdkb dRfJdHUutN1p+srfEn8ceBu2K50j0axej1GDe766ldEeN0aAHxIouM= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:cc:subject:date:message-id; s=default; bh=1y2eYoyb61hWdxYV5+5aMAgxrU8=; b=Dj+6u9rEME7VWuHoebCHa7iCW9fL w+pshFqLWAwpvyysdoqvM2h9dQxPzrO6+jkbFMZ+5+wGIAp1vx6ZoJ4SELXHwpFZ I7d0e+mtEFXJVWeUpe1+TCh0hapXJdSb9NZ2NdknAOHHCBAJCsXRleyi2UvwMVqa DDTo6pR1Aw3K0Sw= Received: (qmail 11947 invoked by alias); 20 Apr 2016 20:55:30 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 11321 invoked by uid 89); 20 Apr 2016 20:55:29 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=BAYES_00, KAM_LAZY_DOMAIN_SECURITY, RP_MATCHES_RCVD autolearn=ham version=3.3.2 spammy=gomes, Gomes, Hx-languages-length:4684, gabriel X-HELO: e24smtp04.br.ibm.com X-IBM-Helo: d24dlp02.br.ibm.com X-IBM-MailFrom: gftg@linux.vnet.ibm.com X-IBM-RcptTo: libc-alpha@sourceware.org From: "Gabriel F. T. Gomes" To: libc-alpha@sourceware.org Cc: munroesj@linux.vnet.ibm.com, tuliom@linux.vnet.ibm.com Subject: [PATCH 1/2] powerpc: Zero pad using memset in strncpy/stpncpy Date: Wed, 20 Apr 2016 17:55:15 -0300 Message-Id: <1461185716-7332-1-git-send-email-gftg@linux.vnet.ibm.com> X-TM-AS-MML: disable X-Content-Scanned: Fidelis XPS MAILER x-cbid: 16042020-0029-0000-0000-00000B6429DE Call __memset_power8 to pad, with zeros, the remaining bytes in the dest string on __strncpy_power8 and __stpncpy_power8. This improves performance when n is larger than the input string, giving ~30% gain for larger strings without impacting much shorter strings. 2016-04-18 Gabriel F. T. Gomes * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the remaining bytes in the dest string, with zeros. --- sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++---------------- 1 file changed, 56 insertions(+), 67 deletions(-) diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S index 17c3afb..0bb3bd4 100644 --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S @@ -24,6 +24,8 @@ # define FUNC_NAME strncpy #endif +#define FRAMESIZE (FRAME_MIN_SIZE+48) + /* Implements the function char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0) addi r10,r4,16 rlwinm r9,r4,0,19,19 - /* Since it is a leaf function, save some non-volatile registers on the - protected/red zone. */ + /* Save some non-volatile registers on the stack. */ std r26,-48(r1) std r27,-40(r1) @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0) std r30,-16(r1) std r31,-8(r1) + /* Update CFI. */ + cfi_offset(r26, -48) + cfi_offset(r27, -40) + cfi_offset(r28, -32) + cfi_offset(r29, -24) + cfi_offset(r30, -16) + cfi_offset(r31, -8) + beq cr7,L(unaligned_lt_16) rldicl r9,r4,0,61 subfic r8,r9,8 @@ -180,74 +189,58 @@ L(short_path_loop_end): ld r31,-8(r1) blr - /* This code pads the remainder dest with NULL bytes. The algorithm - calculate the remanining size and issues a doubleword unrolled - loops followed by a byte a byte set. */ + /* This code pads the remainder of dest with NULL bytes. The algorithm + calculates the remaining size and calls memset. */ .align 4 L(zero_pad_start): mr r5,r10 mr r9,r6 L(zero_pad_start_1): - srdi. r8,r5,r3 - mr r10,r9 -#ifdef USE_AS_STPNCPY - mr r3,r9 + /* At this point: + - r5 holds the number of bytes that still have to be written to + dest. + - r9 points to the position, in dest, where the first null byte + will be written. + The above statements are true both when control reaches this label + from a branch or when falling through the previous lines. */ +#ifndef USE_AS_STPNCPY + mr r30,r3 /* Save the return value of strncpy. */ +#endif + /* Prepare the call to memset. */ + mr r3,r9 /* Pointer to the area to be zero-filled. */ + li r4,0 /* Byte to be written (zero). */ + + /* We delayed the creation of the stack frame, as well as the saving of + the link register, because only at this point, we are sure that + doing so is actually needed. */ + + /* Save the link register. */ + mflr r0 + std r0,16(r1) + cfi_offset(lr, 16) + + /* Create the stack frame. */ + stdu r1,-FRAMESIZE(r1) + cfi_adjust_cfa_offset(FRAMESIZE) + + bl __memset_power8 + nop + + /* Restore the stack frame. */ + addi r1,r1,FRAMESIZE + cfi_adjust_cfa_offset(-FRAMESIZE) + /* Restore the link register. */ + ld r0,16(r1) + mtlr r0 + +#ifndef USE_AS_STPNCPY + mr r3,r30 /* Restore the return value of strncpy, i.e.: + dest. For stpncpy, the return value is the + same as return value of memset. */ #endif - beq- cr0,L(zero_pad_loop_b_start) - cmpldi cr7,r8,1 - li cr7,0 - std r7,0(r9) - beq cr7,L(zero_pad_loop_b_prepare) - addic. r8,r8,-2 - addi r10,r9,r16 - std r7,8(r9) - beq cr0,L(zero_pad_loop_dw_2) - std r7,16(r9) - li r9,0 - b L(zero_pad_loop_dw_1) - - .align 4 -L(zero_pad_loop_dw): - addi r10,r10,16 - std r9,-8(r10) - beq cr0,L(zero_pad_loop_dw_2) - std r9,0(r10) -L(zero_pad_loop_dw_1): - cmpldi cr7,r8,1 - std r9,0(r10) - addic. r8,r8,-2 - bne cr7,L(zero_pad_loop_dw) - addi r10,r10,8 -L(zero_pad_loop_dw_2): - rldicl r5,r5,0,61 -L(zero_pad_loop_b_start): - cmpdi cr7,r5,0 - addi r5,r5,-1 - addi r9,r10,-1 - add r10,r10,5 - subf r10,r9,r10 - li r8,0 - beq- cr7,L(short_path_loop_end) - - /* Write remaining 1-8 bytes. */ - .align 4 - addi r9,r9,1 - mtocrf 0x1,r10 - bf 29,4f - stw r8,0(r9) - addi r9,r9,4 - - .align 4 -4: bf 30,2f - sth r8,0(r9) - addi r9,r9,2 - - .align 4 -2: bf 31,1f - stb r8,0(r9) - /* Restore non-volatile registers. */ -1: ld r26,-48(r1) + /* Restore non-volatile registers and return. */ + ld r26,-48(r1) ld r27,-40(r1) ld r28,-32(r1) ld r29,-24(r1) @@ -443,10 +436,6 @@ L(short_path_prepare_2_3): mr r4,r28 mr r9,r29 b L(short_path_2) -L(zero_pad_loop_b_prepare): - addi r10,r9,8 - rldicl r5,r5,0,61 - b L(zero_pad_loop_b_start) L(zero_pad_start_prepare_1): mr r5,r6 mr r9,r8