@@ -43,8 +43,10 @@
#endif
#include <linux/compiler.h>
+#include <linux/types.h>
#include <asm/asm-compat.h>
#include <asm/synch.h>
+#include <asm/cputable.h>
/*
* clear_bit doesn't imply a memory barrier
@@ -263,10 +265,130 @@ static __inline__ int fls64(__u64 x)
#endif /* __powerpc64__ */
#ifdef CONFIG_PPC64
-unsigned int __arch_hweight8(unsigned int w);
-unsigned int __arch_hweight16(unsigned int w);
-unsigned int __arch_hweight32(unsigned int w);
-unsigned long __arch_hweight64(__u64 w);
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+ unsigned int register iop asm("r3") = w;
+ unsigned int register tmp asm("r4");
+ __asm__ __volatile__ (
+ stringify_in_c(BEGIN_FTR_SECTION)
+ "bl .__sw_hweight8;"
+ "nop;"
+ stringify_in_c(FTR_SECTION_ELSE)
+ PPC_POPCNTB_M(%1,%2) ";"
+ "clrldi %0,%1,64-8;"
+ stringify_in_c(ALT_FTR_SECTION_END_IFCLR((%3)))
+ : "=r" (iop), "=r" (tmp)
+ : "r" (iop), "i" (CPU_FTR_POPCNTB)
+ : "r0", "r1", "r5", "r6", "r7", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r31", "lr", "cr0", "xer");
+
+ return iop;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+ unsigned int register iop asm("r3") = w;
+ unsigned int register tmp asm("r4");
+ __asm__ __volatile__ (
+ stringify_in_c(BEGIN_FTR_SECTION)
+ "bl .__sw_hweight16;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ stringify_in_c(FTR_SECTION_ELSE)
+ stringify_in_c(BEGIN_FTR_SECTION_NESTED(50))
+ PPC_POPCNTB_M(%0,%2) ";"
+ "srdi %1,%0,8;"
+ "add %0,%1,%0;"
+ "clrldi %0,%0,64-8;"
+ stringify_in_c(FTR_SECTION_ELSE_NESTED(50))
+ "clrlwi %0,%2,16;"
+ PPC_POPCNTW_M(%1,%0) ";"
+ "clrldi %0,%1,64-8;"
+ stringify_in_c(ALT_FTR_SECTION_END_NESTED_IFCLR(%4,50))
+ stringify_in_c(ALT_FTR_SECTION_END_IFCLR((%3)))
+ : "=r" (iop), "=r" (tmp)
+ : "r" (iop), "i" (CPU_FTR_POPCNTB), "i" (CPU_FTR_POPCNTD)
+ : "r0", "r1", "r5", "r6", "r7", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r31", "lr", "cr0", "xer");
+
+ return iop;
+}
+
+static inline unsigned int __arch_hweight32(unsigned int w)
+{
+ unsigned int register iop asm("r3") = w;
+ unsigned int register tmp asm("r4");
+ __asm__ __volatile__ (
+ stringify_in_c(BEGIN_FTR_SECTION)
+ "bl .__sw_hweight32;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ stringify_in_c(FTR_SECTION_ELSE)
+ stringify_in_c(BEGIN_FTR_SECTION_NESTED(51))
+ PPC_POPCNTB_M(%0,%2) ";"
+ "srdi %1,%0,16;"
+ "add %0,%1,%0;"
+ "srdi %1,%0,8;"
+ "add %0,%1,%0;"
+ "clrldi %0,%0,64-8;"
+ stringify_in_c(FTR_SECTION_ELSE_NESTED(51))
+ PPC_POPCNTW_M(%1,%2) ";"
+ "clrldi %0,%1,64-8;"
+ stringify_in_c(ALT_FTR_SECTION_END_NESTED_IFCLR(%4,51))
+ stringify_in_c(ALT_FTR_SECTION_END_IFCLR((%3)))
+ : "=r" (iop), "=r" (tmp)
+ : "r" (iop), "i" (CPU_FTR_POPCNTB), "i" (CPU_FTR_POPCNTD)
+ : "r0", "r1", "r5", "r6", "r7", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r31", "lr", "cr0", "xer");
+
+ return iop;
+}
+
+static inline __u64 __arch_hweight64(__u64 w)
+{
+ __u64 register iop asm("r3") = w;
+ __u64 register tmp asm("r4");
+ __asm__ __volatile__ (
+ stringify_in_c(BEGIN_FTR_SECTION)
+ "bl .__sw_hweight64;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ "nop;"
+ stringify_in_c(FTR_SECTION_ELSE)
+ stringify_in_c(BEGIN_FTR_SECTION_NESTED(52))
+ PPC_POPCNTB_M(%0,%2) ";"
+ "srdi %1,%0,32;"
+ "add %0,%1,%0;"
+ "srdi %1,%0,16;"
+ "add %0,%1,%0;"
+ "srdi %1,%0,8;"
+ "add %0,%1,%0;"
+ "clrldi %0,%0,64-8;"
+ stringify_in_c(FTR_SECTION_ELSE_NESTED(52))
+ PPC_POPCNTD_M(%1,%2) ";"
+ "clrldi %0,%1,64-8;"
+ stringify_in_c(ALT_FTR_SECTION_END_NESTED_IFCLR(%4,52))
+ stringify_in_c(ALT_FTR_SECTION_END_IFCLR((%3)))
+ : "=r" (iop), "=r" (tmp)
+ : "r" (iop), "i" (CPU_FTR_POPCNTB), "i" (CPU_FTR_POPCNTD)
+ : "r0", "r1", "r5", "r6", "r7", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r31", "lr", "cr0", "xer");
+
+ return iop;
+}
+
#include <asm-generic/bitops/const_hweight.h>
#else
#include <asm-generic/bitops/hweight.h>
@@ -245,6 +245,12 @@
__PPC_RA(a) | __PPC_RS(s))
#define PPC_POPCNTW(a, s) stringify_in_c(.long PPC_INST_POPCNTW | \
__PPC_RA(a) | __PPC_RS(s))
+#define PPC_POPCNTB_M(a, s) stringify_in_c(.long PPC_INST_POPCNTB | \
+ ___PPC_RA(a) | ___PPC_RS(s))
+#define PPC_POPCNTD_M(a, s) stringify_in_c(.long PPC_INST_POPCNTD | \
+ ___PPC_RA(a) | ___PPC_RS(s))
+#define PPC_POPCNTW_M(a, s) stringify_in_c(.long PPC_INST_POPCNTW | \
+ ___PPC_RA(a) | ___PPC_RS(s))
#define PPC_RFCI stringify_in_c(.long PPC_INST_RFCI)
#define PPC_RFDI stringify_in_c(.long PPC_INST_RFDI)
#define PPC_RFMCI stringify_in_c(.long PPC_INST_RFMCI)
@@ -16,7 +16,7 @@ obj-$(CONFIG_HAS_IOMEM) += devres.o
obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \
memcpy_64.o usercopy_64.o mem_64.o string.o \
- checksum_wrappers_64.o hweight_64.o \
+ checksum_wrappers_64.o \
copyuser_power7.o string_64.o copypage_power7.o \
memcpy_power7.o
obj-$(CONFIG_PPC_EMULATE_SSTEP) += sstep.o ldstfp.o
This patch attempts to improve the performance of the __arch_hweight functions by making them inline, instead of the current out-of-line implementation.

The test case is to disable/enable SMT on a large (192 thread) POWER7 LPAR. The program used for the SMT disable/enable is "ppc64_cpu" with the "--smt=[off/on]" option. Here is the perf output. In this case, __arch_hweight64 is called by __bitmap_weight.

Without patch (ppc64_cpu --smt=off):

 17.60%  ppc64_cpu  [kernel.kallsyms]  [k] .deactivate_slab
  ....
  4.85%  ppc64_cpu  [kernel.kallsyms]  [k] .__bitmap_weight
  ....
  1.36%  ppc64_cpu  [kernel.kallsyms]  [k] .__disable_runtime
  1.29%  ppc64_cpu  [kernel.kallsyms]  [k] .__arch_hweight64

With patch (ppc64_cpu --smt=off):

 17.29%  ppc64_cpu  [kernel.kallsyms]  [k] .deactivate_slab
  ....
  3.71%  ppc64_cpu  [kernel.kallsyms]  [k] .__bitmap_weight
  3.26%  ppc64_cpu  [kernel.kallsyms]  [k] .build_overlap_sched_groups
  ....

Without patch (ppc64_cpu --smt=on):

  8.35%  ppc64_cpu  [kernel.kallsyms]  [k] .strlen
  7.00%  ppc64_cpu  [kernel.kallsyms]  [k] .memset
  6.78%  ppc64_cpu  [kernel.kallsyms]  [k] .__bitmap_weight
  4.23%  ppc64_cpu  [kernel.kallsyms]  [k] .deactivate_slab
  ....
  1.58%  ppc64_cpu  [kernel.kallsyms]  [k] .refresh_zone_stat_thresholds
  1.57%  ppc64_cpu  [kernel.kallsyms]  [k] .__arch_hweight64
  1.54%  ppc64_cpu  [kernel.kallsyms]  [k] .__enable_runtime
  ....

With patch (ppc64_cpu --smt=on):

  9.44%  ppc64_cpu  [kernel.kallsyms]  [k] .strlen
  6.43%  ppc64_cpu  [kernel.kallsyms]  [k] .memset
  5.48%  ppc64_cpu  [kernel.kallsyms]  [k] .__bitmap_weight
  4.59%  ppc64_cpu  [kernel.kallsyms]  [k] .insert_entry
  4.29%  ppc64_cpu  [kernel.kallsyms]  [k] .deactivate_slab
  ....

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/bitops.h     | 130 ++++++++++++++++++++++++++++++++-
 arch/powerpc/include/asm/ppc-opcode.h |   6 ++
 arch/powerpc/lib/Makefile             |   2 +-
 3 files changed, 133 insertions(+), 5 deletions(-)