From patchwork Sat Jun 14 07:06:39 2014
X-Patchwork-Submitter: Richard Henderson
X-Patchwork-Id: 359756
Message-ID: <539BF47F.3030907@twiddle.net>
Date: Sat, 14 Jun 2014 00:06:39 -0700
From: Richard Henderson
To: libc-alpha <libc-alpha@sourceware.org>
CC: Ondřej Bílka, Marcus Shawcroft
Subject: [RFC PATCH] aarch64: improve memset

The major idea here is to use IFUNC to check the zva line size once, and
use that to select different entry points.  This saves 3 branches during
startup, and allows significantly more flexibility.

I've also cribbed several of the unaligned-store ideas that Ondrej has
used in the x86 versions.

I've done some performance testing using cachebench, which suggests that
the unrolled memset_zva_64 path is 1.5x faster than the current memset at
1024 bytes and above.  The non-zva path appears to be largely unchanged.

I'd like to use some of Ondrej's benchmarks and data, but I couldn't
locate them in a quick search of the mailing list.  Pointers?

Comments?


r~

diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index 06f04be..523406d 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -20,23 +20,14 @@
  *
  * ARMv8-a, AArch64
  * Unaligned accesses
- *
  */
 
 #include <sysdep.h>
 
-/* By default we assume that the DC instruction can be used to zero
-   data blocks more efficiently.  In some circumstances this might be
-   unsafe, for example in an asymmetric multiprocessor environment with
-   different DC clear lengths (neither the upper nor lower lengths are
-   safe to use).  The feature can be disabled by defining DONT_USE_DC.
-
-   If code may be run in a virtualized environment, then define
-   MAYBE_VIRT.  This will cause the code to cache the system register
-   values rather than re-reading them each call.  */
-
 #define dstin          x0
-#define val            w1
+#define dstin_w        w0
+#define val            x1
+#define valw           w1
 #define count          x2
 #define tmp1           x3
 #define tmp1w          w3
@@ -44,186 +35,280 @@
 #define tmp2w          w4
 #define zva_len_x      x5
 #define zva_len        w5
-#define zva_bits_x     x6
-
-#define A_l            x7
-#define A_lw           w7
+#define zva_mask_x     x6
+#define zva_mask       w6
 #define dst            x8
-#define tmp3w          w9
-
-ENTRY_ALIGN (__memset, 6)
-
-        mov     dst, dstin              /* Preserve return value.  */
-        ands    A_lw, val, #255
-#ifndef DONT_USE_DC
-        b.eq    L(zero_mem)
-#endif
-        orr     A_lw, A_lw, A_lw, lsl #8
-        orr     A_lw, A_lw, A_lw, lsl #16
-        orr     A_l, A_l, A_l, lsl #32
-L(tail_maybe_long):
-        cmp     count, #64
-        b.ge    L(not_short)
-L(tail_maybe_tiny):
-        cmp     count, #15
-        b.le    L(tail15tiny)
-L(tail63):
-        ands    tmp1, count, #0x30
-        b.eq    L(tail15)
-        add     dst, dst, tmp1
-        cmp     tmp1w, #0x20
-        b.eq    1f
-        b.lt    2f
-        stp     A_l, A_l, [dst, #-48]
-1:
-        stp     A_l, A_l, [dst, #-32]
-2:
-        stp     A_l, A_l, [dst, #-16]
-
-L(tail15):
-        and     count, count, #15
-        add     dst, dst, count
-        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
-        RET
+#define dst_w          w8
+#define dstend         x9
+
+        .globl memset
+        cfi_startproc
+
+#if HAVE_IFUNC && !defined (IS_IN_rtld)
+/* Rather than decode dczid_el0 every time, checking for zva disabled and
+   unpacking the line size, do this once in the indirect function and choose
+   an appropriate entry point which encodes these values as constants.  */
 
-L(tail15tiny):
-        /* Set up to 15 bytes.  Does not assume earlier memory
-           being set.  */
-        tbz     count, #3, 1f
-        str     A_l, [dst], #8
-1:
-        tbz     count, #2, 1f
-        str     A_lw, [dst], #4
-1:
-        tbz     count, #1, 1f
-        strh    A_lw, [dst], #2
-1:
-        tbz     count, #0, 1f
-        strb    A_lw, [dst]
-1:
+        .type   memset, %gnu_indirect_function
+memset:
+        mrs     x1, dczid_el0
+        adrp    x0, 1f
+        tst     x1, #16                 /* test for zva disabled */
+        and     x1, x1, #15
+        add     x0, x0, #:lo12:1f
+        csel    x1, xzr, x1, ne         /* squash index to 0 if so */
+        ldrsw   x2, [x0, x1, lsl #2]
+        add     x0, x0, x2
         RET
+        .size   memset, .-memset
+
+        .section .rodata
+1:      .long   memset_nozva - 1b       // 0
+        .long   memset_nozva - 1b       // 1
+        .long   memset_nozva - 1b       // 2
+        .long   memset_nozva - 1b       // 3
+        .long   memset_zva_64 - 1b      // 4
+        .long   memset_zva_128 - 1b     // 5
+        .long   memset_zva_256 - 1b     // 6
+        .long   memset_zva_512 - 1b     // 7
+        .long   memset_zva_1024 - 1b    // 8
+        .long   memset_zva_2048 - 1b    // 9
+        .long   memset_zva_4096 - 1b    // 10
+        .long   memset_zva_8192 - 1b    // 11
+        .long   memset_zva_16384 - 1b   // 12
+        .long   memset_zva_32768 - 1b   // 13
+        .long   memset_zva_65536 - 1b   // 14
+        .long   memset_zva_131072 - 1b  // 15
+        .previous
+
+/* The 64 byte zva size is too small, and needs unrolling for efficiency.  */
 
-        /* Critical loop.  Start at a new cache line boundary.  Assuming
-         * 64 bytes per line, this ensures the entire loop is in one line.  */
         .p2align 6
-L(not_short):
-        neg     tmp2, dst
-        ands    tmp2, tmp2, #15
-        b.eq    2f
-        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
-         * more than that to set, so we simply store 16 bytes and advance by
-         * the amount required to reach alignment.  */
-        sub     count, count, tmp2
-        stp     A_l, A_l, [dst]
-        add     dst, dst, tmp2
-        /* There may be less than 63 bytes to go now.  */
-        cmp     count, #63
-        b.le    L(tail63)
-2:
-        sub     dst, dst, #16           /* Pre-bias.  */
-        sub     count, count, #64
-1:
-        stp     A_l, A_l, [dst, #16]
-        stp     A_l, A_l, [dst, #32]
-        stp     A_l, A_l, [dst, #48]
-        stp     A_l, A_l, [dst, #64]!
-        subs    count, count, #64
-        b.ge    1b
-        tst     count, #0x3f
-        add     dst, dst, #16
-        b.ne    L(tail63)
+        .type   memset_zva_64, %function
+memset_zva_64:
+        CALL_MCOUNT
+        and     valw, valw, #255
+        cmp     count, #256
+        ccmp    valw, #0, #0, hs        /* hs ? cmp val,0 : !z */
+        b.ne    L(nz_or_small)
+
+        stp     xzr, xzr, [dstin]       /* first 16 aligned 1.  */
+        and     tmp2, dstin, #-16
+        and     dst, dstin, #-64
+
+        stp     xzr, xzr, [tmp2, #16]   /* first 64 aligned 16.  */
+        add     dstend, dstin, count
+        add     dst, dst, #64
+
+        stp     xzr, xzr, [tmp2, #32]
+        sub     count, dstend, dst      /* recompute for misalign */
+        add     tmp1, dst, #64
+
+        stp     xzr, xzr, [tmp2, #48]
+        sub     count, count, #128      /* pre-bias */
+
+        stp     xzr, xzr, [tmp2, #64]
+
+        .p2align 6,,24
+0:      dc      zva, dst
+        subs    count, count, #128
+        dc      zva, tmp1
+        add     dst, dst, #128
+        add     tmp1, tmp1, #128
+        b.hs    0b
+
+        adds    count, count, #128      /* undo pre-bias */
+        b.ne    L(zva_tail)
         RET
-#ifndef DONT_USE_DC
-        /* For zeroing memory, check to see if we can use the ZVA feature to
-         * zero entire 'cache' lines.  */
-L(zero_mem):
-        mov     A_l, #0
-        cmp     count, #63
-        b.le    L(tail_maybe_tiny)
-        neg     tmp2, dst
-        ands    tmp2, tmp2, #15
-        b.eq    1f
-        sub     count, count, tmp2
-        stp     A_l, A_l, [dst]
-        add     dst, dst, tmp2
-        cmp     count, #63
-        b.le    L(tail63)
-1:
-        /* For zeroing small amounts of memory, it's not worth setting up
-         * the line-clear code.  */
-        cmp     count, #128
-        b.lt    L(not_short)
-#ifdef MAYBE_VIRT
-        /* For efficiency when virtualized, we cache the ZVA capability.  */
-        adrp    tmp2, L(cache_clear)
-        ldr     zva_len, [tmp2, #:lo12:L(cache_clear)]
-        tbnz    zva_len, #31, L(not_short)
-        cbnz    zva_len, L(zero_by_line)
-        mrs     tmp1, dczid_el0
-        tbz     tmp1, #4, 1f
-        /* ZVA not available.  Remember this for next time.  */
-        mov     zva_len, #~0
-        str     zva_len, [tmp2, #:lo12:L(cache_clear)]
-        b       L(not_short)
-1:
-        mov     tmp3w, #4
-        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
-        lsl     zva_len, tmp3w, zva_len
-        str     zva_len, [tmp2, #:lo12:L(cache_clear)]
+        .size   memset_zva_64, . - memset_zva_64
+
+/* For larger zva sizes, a simple loop ought to suffice.  */
+/* ??? Needs performance testing, when such hardware becomes available.  */
+
+.macro do_zva len
+        .p2align 4
+        .type   memset_zva_\len, %function
+memset_zva_\len:
+        CALL_MCOUNT
+        and     valw, valw, #255
+        cmp     count, #\len
+        ccmp    valw, #0, #0, hs        /* hs ? cmp val,0 : !z */
+        b.ne    L(nz_or_small)
+
+        add     dstend, dstin, count
+        mov     zva_len, #\len
+        mov     zva_mask, #\len-1
+        b       memset_zva_n
+
+        .size   memset_zva_\len, . - memset_zva_\len
+.endm
+
+        do_zva  128     // 5
+        do_zva  256     // 6
+        do_zva  512     // 7
+        do_zva  1024    // 8
+        do_zva  2048    // 9
+        do_zva  4096    // 10
+        do_zva  8192    // 11
+        do_zva  16384   // 12
+        do_zva  32768   // 13
+        do_zva  65536   // 14
+        do_zva  131072  // 15
+
+        .p2align 6
 #else
+/* Without IFUNC, we must load the zva data from the dczid register.  */
+
+        .p2align 6
+        .type   memset, %function
+memset:
+        and     valw, valw, #255
+        cmp     count, #256
+        ccmp    valw, #0, #0, hs        /* hs ? cmp val,0 : !z */
+        b.ne    L(nz_or_small)
+
+        mrs     tmp1, dczid_el0
-        tbnz    tmp1, #4, L(not_short)
-        mov     tmp3w, #4
-        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
-        lsl     zva_len, tmp3w, zva_len
-#endif
-
-L(zero_by_line):
-        /* Compute how far we need to go to become suitably aligned.  We're
-         * already at quad-word alignment.  */
+        tbnz    tmp1, #4, L(nz_or_small)
+
+        and     tmp1w, tmp1w, #15
+        mov     zva_len, #4
+        add     dstend, dstin, count
+        lsl     zva_len, zva_len, tmp1w
         cmp     count, zva_len_x
-        b.lt    L(not_short)            /* Not enough to reach alignment.  */
-        sub     zva_bits_x, zva_len_x, #1
-        neg     tmp2, dst
-        ands    tmp2, tmp2, zva_bits_x
-        b.eq    1f                      /* Already aligned.  */
-        /* Not aligned, check that there's enough to copy after alignment.  */
-        sub     tmp1, count, tmp2
-        cmp     tmp1, #64
-        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
-        b.lt    L(not_short)
-        /* We know that there's at least 64 bytes to zero and that it's safe
-         * to overrun by 64 bytes.  */
-        mov     count, tmp1
-2:
-        stp     A_l, A_l, [dst]
-        stp     A_l, A_l, [dst, #16]
-        stp     A_l, A_l, [dst, #32]
-        subs    tmp2, tmp2, #64
-        stp     A_l, A_l, [dst, #48]
-        add     dst, dst, #64
-        b.ge    2b
-        /* We've overrun a bit, so adjust dst downwards.  */
-        add     dst, dst, tmp2
-1:
-        sub     count, count, zva_len_x
-3:
-        dc      zva, dst
-        add     dst, dst, zva_len_x
+        sub     zva_mask, zva_len, #1
+        b.lo    L(ge_64)
+
+        /* Fall through into memset_zva_n.  */
+        .size   memset, . - memset
+#endif /* HAVE_IFUNC */
+
+/* Main part of the zva path.  On arrival here, we've already checked for
+   minimum size and that VAL is zero.  Also, we've set up zva_len and mask.  */
+
+        .type   memset_zva_n, %function
+memset_zva_n:
+        stp     xzr, xzr, [dstin]       /* first 16 aligned 1.  */
+        neg     tmp1w, dstin_w
+        sub     count, count, zva_len_x /* pre-bias */
+        mov     dst, dstin
+        ands    tmp1w, tmp1w, zva_mask
+        b.ne    3f
+
+        .p2align 6,,16
+2:      dc      zva, dst
         subs    count, count, zva_len_x
-        b.ge    3b
-        ands    count, count, zva_bits_x
-        b.ne    L(tail_maybe_long)
+        add     dst, dst, zva_len_x
+        b.hs    2b
+
+        adds    count, count, zva_len_x /* undo pre-bias */
+        b.ne    L(zva_tail)
         RET
-#ifdef MAYBE_VIRT
-        .bss
-        .p2align 2
-L(cache_clear):
-        .space 4
-#endif
-#endif /* DONT_USE_DC */
-
-END (__memset)
-weak_alias (__memset, memset)
+
+        .p2align 4
+3:      and     tmp2, dstin, #-16
+        sub     count, count, tmp1      /* account for misalign */
+        add     dst, dstin, tmp1
+
+        .p2align 6,,24
+4:      stp     xzr, xzr, [tmp2, #16]
+        stp     xzr, xzr, [tmp2, #32]
+        subs    tmp1w, tmp1w, #64
+        stp     xzr, xzr, [tmp2, #48]
+        stp     xzr, xzr, [tmp2, #64]!
+        b.hi    4b
+
+        b       2b
+
+        .size   memset_zva_n, . - memset_zva_n
+
+/* The non-zva path.  */
+
+        .p2align 6
+        .type   memset_nozva, %function
+memset_nozva:
+        CALL_MCOUNT
+        and     valw, valw, #255
+L(nz_or_small):
+        orr     valw, valw, valw, lsl #8        /* replicate the byte */
+        cmp     count, #64
+        orr     valw, valw, valw, lsl #16
+        add     dstend, dstin, count            /* remember end of buffer */
+        orr     val, val, val, lsl #32
+        b.hs    L(ge_64)
+
+        /* Small data -- original count is less than 64 bytes.  */
+L(le_63):
+        cmp     count, #16
+        b.lo    L(le_15)
+        stp     val, val, [dstin]
+        tbz     count, #5, L(le_31)
+        stp     val, val, [dstin, #16]
+        stp     val, val, [dstend, #-32]
+L(le_31):
+        stp     val, val, [dstend, #-16]
+        RET
+        .p2align 6,,16
+L(le_15):
+        tbz     count, #3, L(le_7)
+        str     val, [dstin]
+        str     val, [dstend, #-8]
+        RET
+        .p2align 6,,16
+L(le_7):
+        tbz     count, #2, L(le_3)
+        str     valw, [dstin]
+        str     valw, [dstend, #-4]
+        RET
+        .p2align 6,,20
+L(le_3):
+        tbz     count, #1, L(le_1)
+        strh    valw, [dstend, #-2]
+L(le_1):
+        tbz     count, #0, L(le_0)
+        strb    valw, [dstin]
+L(le_0):
+        RET
+
+        .p2align 6
+L(ge_64):
+        and     dst, dstin, #-16        /* align the pointer / pre-bias.  */
+        stp     val, val, [dstin]       /* first 16 align 1 */
+        sub     count, dstend, dst      /* begin misalign recompute */
+        subs    count, count, #16+64    /* finish recompute + pre-bias */
+        b.ls    L(loop_tail)
+
+        .p2align 6,,24
+L(loop):
+        stp     val, val, [dst, #16]
+        stp     val, val, [dst, #32]
+        subs    count, count, #64
+        stp     val, val, [dst, #48]
+        stp     val, val, [dst, #64]!
+        b.hs    L(loop)
+
+        adds    count, count, #64       /* undo pre-bias */
+        b.ne    L(loop_tail)
+        RET
+
+        /* Tail of the zva loop.  Less than ZVA bytes, but possibly lots
+           more than 64.  Note that dst is aligned but unbiased.  */
+L(zva_tail):
+        subs    count, count, #64       /* pre-bias */
+        sub     dst, dst, #16           /* pre-bias */
+        b.hi    L(loop)
+
+        /* Tail of the stp loop; less than 64 bytes left.
+           Note that dst is still aligned and biased by -16.  */
+L(loop_tail):
+        stp     val, val, [dstend, #-64]
+        stp     val, val, [dstend, #-48]
+        stp     val, val, [dstend, #-32]
+        stp     val, val, [dstend, #-16]
+        RET
+
+        .size   memset_nozva, . - memset_nozva
+        cfi_endproc
+
+strong_alias (memset, __memset)
 libc_hidden_builtin_def (memset)
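
For reference, here is the decision the indirect-function resolver makes,
expressed as a rough C sketch.  The standalone-program framing and the
variable names are only illustrative (not part of the patch), but the bit
tests mirror the resolver above: bit 4 of dczid_el0 means "zva disabled",
and bits 0-3 hold log2 of the block size in 4-byte words, so the size in
bytes is 4 << bits.  It builds with GCC or Clang on AArch64:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t dczid;

  /* DCZID_EL0 is readable from EL0.  */
  __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));

  unsigned int bs = dczid & 15;         /* BS field: log2 of words.  */
  unsigned int bytes = 4u << bs;        /* DC ZVA block size in bytes.  */

  if ((dczid & 16) != 0 || bytes < 64)
    /* ZVA prohibited, or the block is too small to be worth using;
       the resolver squashes the table index to 0 for the former.  */
    puts ("memset_nozva");
  else
    /* One of memset_zva_64 ... memset_zva_131072.  */
    printf ("memset_zva_%u\n", bytes);
  return 0;
}

Because this choice is made once at symbol-resolution time, each
memset_zva_N entry point can treat the line size as an immediate constant
instead of re-reading dczid_el0 on every call.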