From patchwork Thu Aug 16 15:15:54 2012 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Kirill A. Shutemov" X-Patchwork-Id: 178029 Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Received: from ozlabs.org (localhost [IPv6:::1]) by ozlabs.org (Postfix) with ESMTP id C10372C0185 for ; Fri, 17 Aug 2012 01:17:19 +1000 (EST) Received: from mga11.intel.com (mga11.intel.com [192.55.52.93]) by ozlabs.org (Postfix) with ESMTP id 592052C0145 for ; Fri, 17 Aug 2012 01:16:04 +1000 (EST) Received: from fmsmga001.fm.intel.com ([10.253.24.23]) by fmsmga102.fm.intel.com with ESMTP; 16 Aug 2012 08:16:03 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="4.77,778,1336374000"; d="scan'208";a="202774720" Received: from blue.fi.intel.com ([10.237.72.50]) by fmsmga001.fm.intel.com with ESMTP; 16 Aug 2012 08:15:57 -0700 Received: by blue.fi.intel.com (Postfix, from userid 1000) id 57566E008B; Thu, 16 Aug 2012 18:15:59 +0300 (EEST) From: "Kirill A. Shutemov" To: linux-mm@kvack.org Subject: [PATCH v3 7/7] x86: switch the 64bit uncached page clear to SSE/AVX v2 Date: Thu, 16 Aug 2012 18:15:54 +0300 Message-Id: <1345130154-9602-8-git-send-email-kirill.shutemov@linux.intel.com> X-Mailer: git-send-email 1.7.10.4 In-Reply-To: <1345130154-9602-1-git-send-email-kirill.shutemov@linux.intel.com> References: <1345130154-9602-1-git-send-email-kirill.shutemov@linux.intel.com> Cc: linux-mips@linux-mips.org, linux-sh@vger.kernel.org, Jan Beulich , "H. Peter Anvin" , sparclinux@vger.kernel.org, Andrea Arcangeli , Andi Kleen , Robert Richter , x86@kernel.org, Hugh Dickins , Ingo Molnar , Mel Gorman , Alex Shi , Thomas Gleixner , KAMEZAWA Hiroyuki , Tim Chen , linux-kernel@vger.kernel.org, Andy Lutomirski , Johannes Weiner , Andrew Morton , linuxppc-dev@lists.ozlabs.org, "Kirill A. 
Shutemov" X-BeenThere: linuxppc-dev@lists.ozlabs.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: Linux on PowerPC Developers Mail List List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Errors-To: linuxppc-dev-bounces+patchwork-incoming=ozlabs.org@lists.ozlabs.org Sender: "Linuxppc-dev" From: Andi Kleen With multiple threads, vector stores are more efficient, so use them. This will cause the page clear to run non-preemptible and add some overhead. However, on 32bit it was already non-preemptible (due to kmap_atomic) and there is a preemption opportunity every 4K unit. On an NPB (NASA Parallel Benchmarks) 128GB run on a Westmere this reduces the performance regression of enabling transparent huge pages by ~2% (2.81% to 0.81%), near the runtime variability now. On a system with AVX support more is expected. Signed-off-by: Andi Kleen [kirill.shutemov@linux.intel.com: Properly save/restore arguments] Signed-off-by: Kirill A. Shutemov --- arch/x86/lib/clear_page_64.S | 79 ++++++++++++++++++++++++++++++++++-------- 1 files changed, 64 insertions(+), 15 deletions(-) diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 9d2f3c2..b302cff 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -73,30 +73,79 @@ ENDPROC(clear_page) .Lclear_page_end-clear_page,3b-2b .previous +#define SSE_UNROLL 128 + /* * Zero a page avoiding the caches * rdi page */ ENTRY(clear_page_nocache) CFI_STARTPROC - xorl %eax,%eax - movl $4096/64,%ecx + pushq_cfi %rdi + call kernel_fpu_begin + popq_cfi %rdi + sub $16,%rsp + CFI_ADJUST_CFA_OFFSET 16 + movdqu %xmm0,(%rsp) + xorpd %xmm0,%xmm0 + movl $4096/SSE_UNROLL,%ecx .p2align 4 .Lloop_nocache: decl %ecx -#define PUT(x) movnti %rax,x*8(%rdi) - movnti %rax,(%rdi) - PUT(1) - PUT(2) - PUT(3) - PUT(4) - PUT(5) - PUT(6) - PUT(7) -#undef PUT - leaq 64(%rdi),%rdi + .set x,0 + .rept SSE_UNROLL/16 + movntdq %xmm0,x(%rdi) + .set x,x+16 + .endr + leaq 
SSE_UNROLL(%rdi),%rdi jnz .Lloop_nocache - nop - ret + movdqu (%rsp),%xmm0 + addq $16,%rsp + CFI_ADJUST_CFA_OFFSET -16 + jmp kernel_fpu_end CFI_ENDPROC ENDPROC(clear_page_nocache) + +#ifdef CONFIG_AS_AVX + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (clear_page_nocache_avx - clear_page_nocache) - (2f - 1b) + /* offset */ +2: + .previous + .section .altinstructions,"a" + altinstruction_entry clear_page_nocache,1b,X86_FEATURE_AVX,\ + 16, 2b-1b + .previous + +#define AVX_UNROLL 256 /* TUNE ME */ + +ENTRY(clear_page_nocache_avx) + CFI_STARTPROC + pushq_cfi %rdi + call kernel_fpu_begin + popq_cfi %rdi + sub $32,%rsp + CFI_ADJUST_CFA_OFFSET 32 + vmovdqu %ymm0,(%rsp) + vxorpd %ymm0,%ymm0,%ymm0 + movl $4096/AVX_UNROLL,%ecx + .p2align 4 +.Lloop_avx: + decl %ecx + .set x,0 + .rept AVX_UNROLL/32 + vmovntdq %ymm0,x(%rdi) + .set x,x+32 + .endr + leaq AVX_UNROLL(%rdi),%rdi + jnz .Lloop_avx + vmovdqu (%rsp),%ymm0 + addq $32,%rsp + CFI_ADJUST_CFA_OFFSET -32 + jmp kernel_fpu_end + CFI_ENDPROC +ENDPROC(clear_page_nocache_avx) + +#endif