From patchwork Sat Oct 6 04:23:01 2012 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: David Miller X-Patchwork-Id: 189662 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 40E862C020B for ; Sat, 6 Oct 2012 14:23:11 +1000 (EST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751102Ab2JFEXG (ORCPT ); Sat, 6 Oct 2012 00:23:06 -0400 Received: from shards.monkeyblade.net ([149.20.54.216]:33684 "EHLO shards.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750951Ab2JFEXE (ORCPT ); Sat, 6 Oct 2012 00:23:04 -0400 Received: from localhost (cpe-66-108-116-58.nyc.res.rr.com [66.108.116.58]) by shards.monkeyblade.net (Postfix) with ESMTPSA id 4EDB25898CC for ; Fri, 5 Oct 2012 21:23:06 -0700 (PDT) Date: Sat, 06 Oct 2012 00:23:01 -0400 (EDT) Message-Id: <20121006.002301.1232666623245788670.davem@davemloft.net> To: sparclinux@vger.kernel.org Subject: [PATCH 1/2] sparc64: Niagara-4 bzero/memset, plus use MRU stores in page copy. From: David Miller X-Mailer: Mew version 6.5 on Emacs 24.1 / Mule 6.0 (HANACHIRUSATO) Mime-Version: 1.0 Sender: sparclinux-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: sparclinux@vger.kernel.org This adds optimized memset/bzero/page-clear routines for Niagara-4. We basically can do what powerpc has been able to do for a decade (via the "dcbz" instruction), which is use cache line clearing stores for bzero and memsets with a 'c' argument of zero. As long as we make the cache initializing store to each 32-byte subblock of the L2 cache line, it works. As with other Niagara-4 optimized routines, the key is to make sure to avoid any usage of the %asi register, as reads and writes to it cost at least 50 cycles. For the user clear cases, we don't use these new routines, we use the Niagara-1 variants instead. Those have to use %asi in an unavoidable way. A Niagara-4 8K page clear costs just under 600 cycles. Add definitions of the MRU variants of the cache initializing store ASIs. By default, cache initializing stores install the line as Least Recently Used. If we know we're going to use the data immediately (which is true for page copies and clears) we can use the Most Recently Used variant, to decrease the likelyhood of the lines being evicted before they get used. Signed-off-by: David S. Miller --- arch/sparc/include/asm/asi.h | 19 ++++++++ arch/sparc/kernel/head_64.S | 2 +- arch/sparc/lib/Makefile | 2 +- arch/sparc/lib/NG4clear_page.S | 29 +++++++++++ arch/sparc/lib/NG4copy_page.S | 16 +++--- arch/sparc/lib/NG4memset.S | 105 ++++++++++++++++++++++++++++++++++++++++ arch/sparc/lib/NG4patch.S | 15 +++++- 7 files changed, 176 insertions(+), 12 deletions(-) create mode 100644 arch/sparc/lib/NG4clear_page.S create mode 100644 arch/sparc/lib/NG4memset.S diff --git a/arch/sparc/include/asm/asi.h b/arch/sparc/include/asm/asi.h index cc0006d..aace6f3 100644 --- a/arch/sparc/include/asm/asi.h +++ b/arch/sparc/include/asm/asi.h @@ -270,9 +270,28 @@ #define ASI_BLK_INIT_QUAD_LDD_P 0xe2 /* (NG) init-store, twin load, * primary, implicit */ +#define ASI_BLK_INIT_QUAD_LDD_S 0xe3 /* (NG) init-store, twin load, + * secondary, implicit + */ #define ASI_BLK_P 0xf0 /* Primary, blk ld/st */ #define ASI_BLK_S 0xf1 /* Secondary, blk ld/st */ +#define ASI_ST_BLKINIT_MRU_P 0xf2 /* (NG4) init-store, twin load, + * Most-Recently-Used, primary, + * implicit + */ +#define ASI_ST_BLKINIT_MRU_S 0xf2 /* (NG4) init-store, twin load, + * Most-Recently-Used, secondary, + * implicit + */ #define ASI_BLK_PL 0xf8 /* Primary, blk ld/st, little */ #define ASI_BLK_SL 0xf9 /* Secondary, blk ld/st, little */ +#define ASI_ST_BLKINIT_MRU_PL 0xfa /* (NG4) init-store, twin load, + * Most-Recently-Used, primary, + * implicit, little-endian + */ +#define ASI_ST_BLKINIT_MRU_SL 0xfb /* (NG4) init-store, twin load, + * Most-Recently-Used, secondary, + * implicit, little-endian + */ #endif /* _SPARC_ASI_H */ diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index ee5dcce..2feb15c 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -576,7 +576,7 @@ niagara_tlb_fixup: niagara4_patch: call niagara4_patch_copyops nop - call niagara_patch_bzero + call niagara4_patch_bzero nop call niagara4_patch_pageops nop diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 30f6ab5..8410065f2 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -33,7 +33,7 @@ lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o lib-$(CONFIG_SPARC64) += NG2patch.o lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o -lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o +lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o NG4clear_page.o NG4memset.o lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o diff --git a/arch/sparc/lib/NG4clear_page.S b/arch/sparc/lib/NG4clear_page.S new file mode 100644 index 0000000..e16c882 --- /dev/null +++ b/arch/sparc/lib/NG4clear_page.S @@ -0,0 +1,29 @@ +/* NG4copy_page.S: Niagara-4 optimized clear page. + * + * Copyright (C) 2012 (davem@davemloft.net) + */ + +#include +#include + + .text + + .register %g3, #scratch + + .align 32 + .globl NG4clear_page + .globl NG4clear_user_page +NG4clear_page: /* %o0=dest */ +NG4clear_user_page: /* %o0=dest, %o1=vaddr */ + set PAGE_SIZE, %g7 + mov 0x20, %g3 +1: stxa %g0, [%o0 + %g0] ASI_ST_BLKINIT_MRU_P + subcc %g7, 0x40, %g7 + stxa %g0, [%o0 + %g3] ASI_ST_BLKINIT_MRU_P + bne,pt %xcc, 1b + add %o0, 0x40, %o0 + membar #StoreLoad|#StoreStore + retl + nop + .size NG4clear_page,.-NG4clear_page + .size NG4clear_user_page,.-NG4clear_user_page \ No newline at end of file diff --git a/arch/sparc/lib/NG4copy_page.S b/arch/sparc/lib/NG4copy_page.S index f30ec10..28504e8 100644 --- a/arch/sparc/lib/NG4copy_page.S +++ b/arch/sparc/lib/NG4copy_page.S @@ -30,25 +30,25 @@ NG4copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ ldx [%o1 + 0x10], %o4 ldx [%o1 + 0x18], %o5 ldx [%o1 + 0x20], %g1 - stxa %o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o2, [%o0] ASI_ST_BLKINIT_MRU_P add %o0, 0x08, %o0 ldx [%o1 + 0x28], %g2 - stxa %o3, [%o0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o3, [%o0] ASI_ST_BLKINIT_MRU_P add %o0, 0x08, %o0 ldx [%o1 + 0x30], %g3 - stxa %o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0] ASI_ST_BLKINIT_MRU_P add %o0, 0x08, %o0 ldx [%o1 + 0x38], %o2 add %o1, 0x40, %o1 - stxa %o5, [%o0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o5, [%o0] ASI_ST_BLKINIT_MRU_P add %o0, 0x08, %o0 - stxa %g1, [%o0] ASI_BLK_INIT_QUAD_LDD_P + stxa %g1, [%o0] ASI_ST_BLKINIT_MRU_P add %o0, 0x08, %o0 - stxa %g2, [%o0] ASI_BLK_INIT_QUAD_LDD_P + stxa %g2, [%o0] ASI_ST_BLKINIT_MRU_P add %o0, 0x08, %o0 - stxa %g3, [%o0] ASI_BLK_INIT_QUAD_LDD_P + stxa %g3, [%o0] ASI_ST_BLKINIT_MRU_P add %o0, 0x08, %o0 - stxa %o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o2, [%o0] ASI_ST_BLKINIT_MRU_P add %o0, 0x08, %o0 bne,pt %icc, 1b prefetch [%o1 + 0x200], #n_reads_strong diff --git a/arch/sparc/lib/NG4memset.S b/arch/sparc/lib/NG4memset.S new file mode 100644 index 0000000..41da4bd --- /dev/null +++ b/arch/sparc/lib/NG4memset.S @@ -0,0 +1,105 @@ +/* NG4memset.S: Niagara-4 optimized memset/bzero. + * + * Copyright (C) 2012 David S. Miller (davem@davemloft.net) + */ + +#include + + .register %g2, #scratch + .register %g3, #scratch + + .text + .align 32 + .globl NG4memset +NG4memset: + andcc %o1, 0xff, %o4 + be,pt %icc, 1f + mov %o2, %o1 + sllx %o4, 8, %g1 + or %g1, %o4, %o2 + sllx %o2, 16, %g1 + or %g1, %o2, %o2 + sllx %o2, 32, %g1 + ba,pt %icc, 1f + or %g1, %o2, %o4 + .size NG4memset,.-NG4memset + + .align 32 + .globl NG4bzero +NG4bzero: + clr %o4 +1: cmp %o1, 16 + ble %icc, .Ltiny + mov %o0, %o3 + sub %g0, %o0, %g1 + and %g1, 0x7, %g1 + brz,pt %g1, .Laligned8 + sub %o1, %g1, %o1 +1: stb %o4, [%o0 + 0x00] + subcc %g1, 1, %g1 + bne,pt %icc, 1b + add %o0, 1, %o0 +.Laligned8: + cmp %o1, 64 + (64 - 8) + ble .Lmedium + sub %g0, %o0, %g1 + andcc %g1, (64 - 1), %g1 + brz,pn %g1, .Laligned64 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 8, %g1 + bne,pt %icc, 1b + add %o0, 0x8, %o0 +.Laligned64: + andn %o1, 64 - 1, %g1 + sub %o1, %g1, %o1 + brnz,pn %o4, .Lnon_bzero_loop + mov 0x20, %g2 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x40, %o0 +.Lpostloop: + cmp %o1, 8 + bl,pn %icc, .Ltiny + membar #StoreStore|#StoreLoad +.Lmedium: + andn %o1, 0x7, %g1 + sub %o1, %g1, %o1 +1: stx %o4, [%o0 + 0x00] + subcc %g1, 0x8, %g1 + bne,pt %icc, 1b + add %o0, 0x08, %o0 + andcc %o1, 0x4, %g1 + be,pt %icc, .Ltiny + sub %o1, %g1, %o1 + stw %o4, [%o0 + 0x00] + add %o0, 0x4, %o0 +.Ltiny: + cmp %o1, 0 + be,pn %icc, .Lexit +1: subcc %o1, 1, %o1 + stb %o4, [%o0 + 0x00] + bne,pt %icc, 1b + add %o0, 1, %o0 +.Lexit: + retl + mov %o3, %o0 +.Lnon_bzero_loop: + mov 0x08, %g3 + mov 0x28, %o5 +1: stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + subcc %g1, 0x40, %g1 + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + add %o0, 0x10, %o0 + stxa %o4, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P + stxa %o4, [%o0 + %o5] ASI_BLK_INIT_QUAD_LDD_P + bne,pt %icc, 1b + add %o0, 0x30, %o0 + ba,a,pt %icc, .Lpostloop + .size NG4bzero,.-NG4bzero diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S index c21c34c..a114cbc 100644 --- a/arch/sparc/lib/NG4patch.S +++ b/arch/sparc/lib/NG4patch.S @@ -32,12 +32,23 @@ niagara4_patch_copyops: nop .size niagara4_patch_copyops,.-niagara4_patch_copyops + .globl niagara4_patch_bzero + .type niagara4_patch_bzero,#function +niagara4_patch_bzero: + NG_DO_PATCH(memset, NG4memset) + NG_DO_PATCH(__bzero, NG4bzero) + NG_DO_PATCH(__clear_user, NGclear_user) + NG_DO_PATCH(tsb_init, NGtsb_init) + retl + nop + .size niagara4_patch_bzero,.-niagara4_patch_bzero + .globl niagara4_patch_pageops .type niagara4_patch_pageops,#function niagara4_patch_pageops: NG_DO_PATCH(copy_user_page, NG4copy_user_page) - NG_DO_PATCH(_clear_page, NGclear_page) - NG_DO_PATCH(clear_user_page, NGclear_user_page) + NG_DO_PATCH(_clear_page, NG4clear_page) + NG_DO_PATCH(clear_user_page, NG4clear_user_page) retl nop .size niagara4_patch_pageops,.-niagara4_patch_pageops