@@ -51,11 +51,21 @@ static __inline__ void clear_page(void *addr)
__asm__ __volatile__(
"mtctr %1 # clear_page\n\
-1: dcbz 0,%0\n\
- add %0,%0,%3\n\
+ .balign 16\n\
+1: dcbz 0,%0\n\
+ dcbz %3,%0\n\
+ dcbz %4,%0\n\
+ dcbz %5,%0\n\
+ dcbz %6,%0\n\
+ dcbz %7,%0\n\
+ dcbz %8,%0\n\
+ dcbz %9,%0\n\
+ add %0,%0,%10\n\
bdnz+ 1b"
- : "=r" (addr)
- : "r" (lines), "0" (addr), "r" (line_size)
+ : "=&r" (addr)
+ : "r" (lines/8), "0" (addr), "b" (line_size), "b" (line_size*2),
+ "b" (line_size*3), "b" (line_size*4), "b" (line_size*5),
+ "b" (line_size*6), "b" (line_size*7), "r" (line_size*8)
: "ctr", "memory");
}
Unroll clear_page 8 times. A simple microbenchmark that allocates and frees a zeroed page: for (i = 0; i < iterations; i++) { unsigned long p = __get_free_page(GFP_KERNEL | __GFP_ZERO); free_page(p); } improves 20% on POWER8. This assumes cacheline sizes won't grow beyond 512 bytes and page sizes won't drop below 1kB, which is unlikely, but we could add a runtime check during early init if it makes people nervous. Signed-off-by: Anton Blanchard <anton@samba.org> --- arch/powerpc/include/asm/page_64.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-)