Message ID | 1505950480-14830-3-git-send-email-wei.guo.simon@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | powerpc/64: memcmp() optimization | expand |
Hi, On Thu, Sep 21, 2017 at 07:34:39AM +0800, wei.guo.simon@gmail.com wrote: > From: Simon Guo <wei.guo.simon@gmail.com> > > This patch add VMX primitives to do memcmp() in case the compare size > exceeds 4K bytes. > > Test result with following test program(replace the "^>" with ""): > ------ I missed the exit_vmx_ops() part and need to rework on v3. Thanks, - Simon
On Thu, 2017-09-21 at 07:34 +0800, wei.guo.simon@gmail.com wrote: > From: Simon Guo <wei.guo.simon@gmail.com> > > This patch add VMX primitives to do memcmp() in case the compare size > exceeds 4K bytes. > Hi Simon, Sorry I didn't see this sooner, I've actually been working on a kernel version of glibc commit dec4a7105e (powerpc: Improve memcmp performance for POWER8) unfortunately I've been distracted and it still isn't done. I wonder if we can consolidate our efforts here. One thing I did come across in my testing is that for memcmp() that will fail early (I haven't narrowed down the the optimal number yet) the cost of enabling VMX actually turns out to be a performance regression, as such I've added a small check of the first 64 bytes to the start before enabling VMX to ensure the penalty is worth taking. Also, you should consider doing 4K and greater, KSM (Kernel Samepage Merging) uses PAGE_SIZE which can be as small as 4K. Cyril > Test result with following test program(replace the "^>" with ""): > ------ > > # cat tools/testing/selftests/powerpc/stringloops/memcmp.c > > #include <malloc.h> > > #include <stdlib.h> > > #include <string.h> > > #include <time.h> > > #include "utils.h" > > #define SIZE (1024 * 1024 * 900) > > #define ITERATIONS 40 > > int test_memcmp(const void *s1, const void *s2, size_t n); > > static int testcase(void) > { > char *s1; > char *s2; > unsigned long i; > > s1 = memalign(128, SIZE); > if (!s1) { > perror("memalign"); > exit(1); > } > > s2 = memalign(128, SIZE); > if (!s2) { > perror("memalign"); > exit(1); > } > > for (i = 0; i < SIZE; i++) { > s1[i] = i & 0xff; > s2[i] = i & 0xff; > } > for (i = 0; i < ITERATIONS; i++) { > int ret = test_memcmp(s1, s2, SIZE); > > if (ret) { > printf("return %d at[%ld]! 
should have returned zero\n", ret, i); > abort(); > } > } > > return 0; > } > > int main(void) > { > return test_harness(testcase, "memcmp"); > } > ------ > Without VMX patch: > 7.435191479 seconds time elapsed ( +- 0.51% ) > With VMX patch: > 6.802038938 seconds time elapsed ( +- 0.56% ) > There is ~+8% improvement. > > However I am not aware whether there is use case in kernel for memcmp on > large size yet. > > Signed-off-by: Simon Guo <wei.guo.simon@gmail.com> > --- > arch/powerpc/include/asm/asm-prototypes.h | 2 +- > arch/powerpc/lib/copypage_power7.S | 2 +- > arch/powerpc/lib/memcmp_64.S | 82 +++++++++++++++++++++++++++++++ > arch/powerpc/lib/memcpy_power7.S | 2 +- > arch/powerpc/lib/vmx-helper.c | 2 +- > 5 files changed, 86 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h > index 7330150..e6530d8 100644 > --- a/arch/powerpc/include/asm/asm-prototypes.h > +++ b/arch/powerpc/include/asm/asm-prototypes.h > @@ -49,7 +49,7 @@ void __trace_hcall_exit(long opcode, unsigned long retval, > /* VMX copying */ > int enter_vmx_usercopy(void); > int exit_vmx_usercopy(void); > -int enter_vmx_copy(void); > +int enter_vmx_ops(void); > void * exit_vmx_copy(void *dest); > > /* Traps */ > diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S > index ca5fc8f..9e7729e 100644 > --- a/arch/powerpc/lib/copypage_power7.S > +++ b/arch/powerpc/lib/copypage_power7.S > @@ -60,7 +60,7 @@ _GLOBAL(copypage_power7) > std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) > std r0,16(r1) > stdu r1,-STACKFRAMESIZE(r1) > - bl enter_vmx_copy > + bl enter_vmx_ops > cmpwi r3,0 > ld r0,STACKFRAMESIZE+16(r1) > ld r3,STK_REG(R31)(r1) > diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S > index 6dccfb8..40218fc 100644 > --- a/arch/powerpc/lib/memcmp_64.S > +++ b/arch/powerpc/lib/memcmp_64.S > @@ -162,6 +162,13 @@ _GLOBAL(memcmp) > blr > > .Llong: > +#ifdef CONFIG_ALTIVEC > + /* Try 
to use vmx loop if length is larger than 4K */ > + cmpldi cr6,r5,4096 > + bgt cr6,.Lvmx_cmp > + > +.Llong_novmx_cmp: > +#endif > li off8,8 > li off16,16 > li off24,24 > @@ -319,4 +326,79 @@ _GLOBAL(memcmp) > 8: > blr > > +#ifdef CONFIG_ALTIVEC > +.Lvmx_cmp: > + mflr r0 > + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) > + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) > + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) > + std r0,16(r1) > + stdu r1,-STACKFRAMESIZE(r1) > + bl enter_vmx_ops > + cmpwi cr1,r3,0 > + ld r0,STACKFRAMESIZE+16(r1) > + ld r3,STK_REG(R31)(r1) > + ld r4,STK_REG(R30)(r1) > + ld r5,STK_REG(R29)(r1) > + addi r1,r1,STACKFRAMESIZE > + mtlr r0 > + beq cr1,.Llong_novmx_cmp > + > +3: > + /* Enter with src/dst address 8 bytes aligned, and len is > + * no less than 4KB. Need to align with 16 bytes further. > + */ > + andi. rA,r3,8 > + beq 4f > + LD rA,0,r3 > + LD rB,0,r4 > + cmpld cr0,rA,rB > + bne cr0,.LcmpAB_lightweight > + > + addi r3,r3,8 > + addi r4,r4,8 > + addi r5,r5,-8 > + > +4: > + /* compare 32 bytes for each loop */ > + srdi r0,r5,5 > + mtctr r0 > + andi. r5,r5,31 > + li off16,16 > + > +.balign 16 > +5: > + lvx v0,0,r3 > + lvx v1,0,r4 > + vcmpequd. v0,v0,v1 > + bf 24,7f > + lvx v0,off16,r3 > + lvx v1,off16,r4 > + vcmpequd. 
v0,v0,v1 > + bf 24,6f > + addi r3,r3,32 > + addi r4,r4,32 > + bdnz 5b > + > + cmpdi r5,0 > + beq .Lzero > + b .L8bytes_aligned > + > +6: > + addi r3,r3,16 > + addi r4,r4,16 > + > +7: > + LD rA,0,r3 > + LD rB,0,r4 > + cmpld cr0,rA,rB > + bne cr0,.LcmpAB_lightweight > + > + li off8,8 > + LD rA,off8,r3 > + LD rB,off8,r4 > + cmpld cr0,rA,rB > + bne cr0,.LcmpAB_lightweight > + b .Lzero > +#endif > EXPORT_SYMBOL(memcmp) > diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S > index 193909a..682e386 100644 > --- a/arch/powerpc/lib/memcpy_power7.S > +++ b/arch/powerpc/lib/memcpy_power7.S > @@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7) > std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) > std r0,16(r1) > stdu r1,-STACKFRAMESIZE(r1) > - bl enter_vmx_copy > + bl enter_vmx_ops > cmpwi cr1,r3,0 > ld r0,STACKFRAMESIZE+16(r1) > ld r3,STK_REG(R31)(r1) > diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c > index bf925cd..923a9ab 100644 > --- a/arch/powerpc/lib/vmx-helper.c > +++ b/arch/powerpc/lib/vmx-helper.c > @@ -53,7 +53,7 @@ int exit_vmx_usercopy(void) > return 0; > } > > -int enter_vmx_copy(void) > +int enter_vmx_ops(void) > { > if (in_interrupt()) > return 0;
Hi Cyril, On Sat, Sep 23, 2017 at 12:06:48AM +1000, Cyril Bur wrote: > On Thu, 2017-09-21 at 07:34 +0800, wei.guo.simon@gmail.com wrote: > > From: Simon Guo <wei.guo.simon@gmail.com> > > > > This patch add VMX primitives to do memcmp() in case the compare size > > exceeds 4K bytes. > > > > Hi Simon, > > Sorry I didn't see this sooner, I've actually been working on a kernel > version of glibc commit dec4a7105e (powerpc: Improve memcmp performance > for POWER8) unfortunately I've been distracted and it still isn't done. Thanks for sync with me. Let's consolidate our effort together :) I have a quick check on glibc commit dec4a7105e. Looks the aligned case comparison with VSX is launched without rN size limitation, which means it will have a VSX reg load penalty even when the length is 9 bytes. It did some optimization when src/dest addrs don't have the same offset on 8 bytes alignment boundary. I need to read more closely. > I wonder if we can consolidate our efforts here. One thing I did come > across in my testing is that for memcmp() that will fail early (I > haven't narrowed down the the optimal number yet) the cost of enabling > VMX actually turns out to be a performance regression, as such I've > added a small check of the first 64 bytes to the start before enabling > VMX to ensure the penalty is worth taking. Will there still be a penalty if the 65th byte differs? > > Also, you should consider doing 4K and greater, KSM (Kernel Samepage > Merging) uses PAGE_SIZE which can be as small as 4K. Currently the VMX will only be applied when size exceeds 4K. Are you suggesting a bigger threshold than 4K? We can sync more offline for v3. Thanks, - Simon
On Sun, 2017-09-24 at 05:18 +0800, Simon Guo wrote: > Hi Cyril, > On Sat, Sep 23, 2017 at 12:06:48AM +1000, Cyril Bur wrote: > > On Thu, 2017-09-21 at 07:34 +0800, wei.guo.simon@gmail.com wrote: > > > From: Simon Guo <wei.guo.simon@gmail.com> > > > > > > This patch add VMX primitives to do memcmp() in case the compare size > > > exceeds 4K bytes. > > > > > > > Hi Simon, > > > > Sorry I didn't see this sooner, I've actually been working on a kernel > > version of glibc commit dec4a7105e (powerpc: Improve memcmp performance > > for POWER8) unfortunately I've been distracted and it still isn't done. > > Thanks for sync with me. Let's consolidate our effort together :) > > I have a quick check on glibc commit dec4a7105e. > Looks the aligned case comparison with VSX is launched without rN size > limitation, which means it will have a VSX reg load penalty even when the > length is 9 bytes. > This was written for userspace which doesn't have to explicitly enable VMX in order to use it - we need to be smarter in the kernel. > It did some optimization when src/dest addrs don't have the same offset > on 8 bytes alignment boundary. I need to read more closely. > > > I wonder if we can consolidate our efforts here. One thing I did come > > across in my testing is that for memcmp() that will fail early (I > > haven't narrowed down the the optimal number yet) the cost of enabling > > VMX actually turns out to be a performance regression, as such I've > > added a small check of the first 64 bytes to the start before enabling > > VMX to ensure the penalty is worth taking. > > Will there still be a penalty if the 65th byte differs? > I haven't benchmarked it exactly, my rationale for 64 bytes was that it is the stride of the vectorised copy loop so, if we know we'll fail before even completing one iteration of the vectorized loop there isn't any point using the vector regs. 
> > > > Also, you should consider doing 4K and greater, KSM (Kernel Samepage > > Merging) uses PAGE_SIZE which can be as small as 4K. > > Currently the VMX will only be applied when size exceeds 4K. Are you > suggesting a bigger threshold than 4K? > Equal to or greater than 4K, KSM will benefit. > We can sync more offline for v3. > > Thanks, > - Simon
Cyril Bur <cyrilbur@gmail.com> writes: > On Sun, 2017-09-24 at 05:18 +0800, Simon Guo wrote: >> Hi Cyril, >> On Sat, Sep 23, 2017 at 12:06:48AM +1000, Cyril Bur wrote: >> > On Thu, 2017-09-21 at 07:34 +0800, wei.guo.simon@gmail.com wrote: >> > > From: Simon Guo <wei.guo.simon@gmail.com> >> > > >> > > This patch add VMX primitives to do memcmp() in case the compare size >> > > exceeds 4K bytes. >> > >> > Sorry I didn't see this sooner, I've actually been working on a kernel >> > version of glibc commit dec4a7105e (powerpc: Improve memcmp performance >> > for POWER8) unfortunately I've been distracted and it still isn't done. >> >> Thanks for sync with me. Let's consolidate our effort together :) >> >> I have a quick check on glibc commit dec4a7105e. >> Looks the aligned case comparison with VSX is launched without rN size >> limitation, which means it will have a VSX reg load penalty even when the >> length is 9 bytes. >> > > This was written for userspace which doesn't have to explicitly enable > VMX in order to use it - we need to be smarter in the kernel. Well the kernel has to do it for them after a trap, which is actually even more expensive, so arguably the glibc code should be smarter too and the threshold before using VMX should probably be higher than in the kernel (to cover the cost of the trap). But I digress :) cheers
On Tue, Sep 26, 2017 at 03:34:36PM +1000, Michael Ellerman wrote: > Cyril Bur <cyrilbur@gmail.com> writes: > > This was written for userspace which doesn't have to explicitly enable > > VMX in order to use it - we need to be smarter in the kernel. > > Well the kernel has to do it for them after a trap, which is actually > even more expensive, so arguably the glibc code should be smarter too > and the threshold before using VMX should probably be higher than in the > kernel (to cover the cost of the trap). A lot of userspace code uses V*X, more and more with newer CPUs and newer compiler versions. If you already paid the price for using vector registers you do not need to again :-) > But I digress :) Yeah sorry :-) Segher
Segher Boessenkool <segher@kernel.crashing.org> writes: > On Tue, Sep 26, 2017 at 03:34:36PM +1000, Michael Ellerman wrote: >> Cyril Bur <cyrilbur@gmail.com> writes: >> > This was written for userspace which doesn't have to explicitly enable >> > VMX in order to use it - we need to be smarter in the kernel. >> >> Well the kernel has to do it for them after a trap, which is actually >> even more expensive, so arguably the glibc code should be smarter too >> and the threshold before using VMX should probably be higher than in the >> kernel (to cover the cost of the trap). > > A lot of userspace code uses V*X, more and more with newer CPUs and newer > compiler versions. If you already paid the price for using vector > registers you do not need to again :-) True, but you don't know if you've paid the price already. You also pay the price on every context switch (more state to switch), so it's not free even once enabled. Which is why the kernel will eventually turn it off if it's unused again. But now that I've actually looked at the glibc version, it does do some checks for minimum length before doing any vector instructions, so that's probably all we want. The exact trade off between checking some bytes without vector vs turning on vector depends on your input data, so it's tricky to tune in general. >> But I digress :) > > Yeah sorry :-) :) cheers
On Wed, Sep 27, 2017 at 01:38:09PM +1000, Michael Ellerman wrote: > Segher Boessenkool <segher@kernel.crashing.org> writes: > > A lot of userspace code uses V*X, more and more with newer CPUs and newer > > compiler versions. If you already paid the price for using vector > > registers you do not need to again :-) > > True, but you don't know if you've paid the price already. > > You also pay the price on every context switch (more state to switch), > so it's not free even once enabled. Which is why the kernel will > eventually turn it off if it's unused again. Yup. But my point is that because user space code uses vector registers more and more, the penalty for user space code to use vector registers even more keeps shrinking. > But now that I've actually looked at the glibc version, it does do some > checks for minimum length before doing any vector instructions, so > that's probably all we want. The exact trade off between checking some > bytes without vector vs turning on vector depends on your input data, so > it's tricky to tune in general. You also need nasty code to deal with the start and end of strings, with conditional branches and whatnot, which quickly overwhelms the benefit of using vector registers at all. This tradeoff also changes with newer ISA versions. Things have to become *really* cheap before it will be good to often use vector registers in the kernel though. Segher
From: Segher Boessenkool > Sent: 27 September 2017 10:28 ... > You also need nasty code to deal with the start and end of strings, with > conditional branches and whatnot, which quickly overwhelms the benefit > of using vector registers at all. This tradeoff also changes with newer > ISA versions. The goal posts keep moving. For instance with modern intel x86 cpus 'rep movsb' is by far the fastest way to copy data (from cached memory). > Things have to become *really* cheap before it will be good to often use > vector registers in the kernel though. I've had thoughts about this in the past. If the vector registers belong to the current process then you might get away with just saving the ones you want to use. If they belong to a different process then you also need to tell the FPU save code where you've saved the registers. Then the IPI code can recover all the correct values. On X86 all the AVX registers are caller saved, the system call entry could issue the instruction that invalidates them all. Kernel code running in the context of a user process could then use the registers without saving them. It would only need to set a mark to ensure they are invalidated again on return to user (might be cheap enough to do anyway). Dunno about PPC though. David
Hi Michael, On Wed, Sep 27, 2017 at 01:38:09PM +1000, Michael Ellerman wrote: > Segher Boessenkool <segher@kernel.crashing.org> writes: > > > On Tue, Sep 26, 2017 at 03:34:36PM +1000, Michael Ellerman wrote: > >> Cyril Bur <cyrilbur@gmail.com> writes: > >> > This was written for userspace which doesn't have to explicitly enable > >> > VMX in order to use it - we need to be smarter in the kernel. > >> > >> Well the kernel has to do it for them after a trap, which is actually > >> even more expensive, so arguably the glibc code should be smarter too > >> and the threshold before using VMX should probably be higher than in the > >> kernel (to cover the cost of the trap). > > > > A lot of userspace code uses V*X, more and more with newer CPUs and newer > > compiler versions. If you already paid the price for using vector > > registers you do not need to again :-) > > True, but you don't know if you've paid the price already. > > You also pay the price on every context switch (more state to switch), > so it's not free even once enabled. Which is why the kernel will > eventually turn it off if it's unused again. > > But now that I've actually looked at the glibc version, it does do some > checks for minimum length before doing any vector instructions, so Looks the glibc version will use VSX instruction and lead to trap in a 9 bytes size cmp with src/dst 16 bytes aligned. 132 /* Now both rSTR1 and rSTR2 are aligned to QW. */ 133 .align 4 134 L(qw_align): 135 vspltisb v0, 0 136 srdi. r6, rN, 6 137 li r8, 16 138 li r10, 32 139 li r11, 48 140 ble cr0, L(lessthan64) 141 mtctr r6 142 vspltisb v8, 0 143 vspltisb v6, 0 144 /* Aligned vector loop. */ 145 .align 4 line 135 is the VSX instruction causing trap. Did I miss anything? > that's probably all we want. The exact trade off between checking some > bytes without vector vs turning on vector depends on your input data, so > it's tricky to tune in general. Discussed offline with Cyril. 
The plan for v3 is to use (>=4KB) as the minimum length before the vector regs step in. Cyril will consolidate his existing work on KSM optimization later, which probably means doing a 64-byte comparison ahead of time to determine whether it is an early or late matching pattern. Cyril also has some other valuable comments and I will rework them for v3. Is that OK for you? Thanks, - Simon
On Wed, Sep 27, 2017 at 09:43:44AM +0000, David Laight wrote: > From: Segher Boessenkool > > Sent: 27 September 2017 10:28 > ... > > You also need nasty code to deal with the start and end of strings, with > > conditional branches and whatnot, which quickly overwhelms the benefit > > of using vector registers at all. This tradeoff also changes with newer > > ISA versions. > > The goal posts keep moving. > For instance with modern intel x86 cpus 'rep movsb' is by far the fastest > way to copy data (from cached memory). > > > Things have to become *really* cheap before it will be good to often use > > vector registers in the kernel though. > > I've had thoughts about this in the past. > If the vector registers belong to the current process then you might > get away with just saving the ones you want to use. > If they belong to a different process then you also need to tell the > FPU save code where you've saved the registers. > Then the IPI code can recover all the correct values. > > On X86 all the AVX registers are caller saved, the system call > entry could issue the instruction that invalidates them all. > Kernel code running in the context of a user process could then > use the registers without saving them. > It would only need to set a mark to ensure they are invalidated > again on return to user (might be cheap enough to do anyway). > Dunno about PPC though. I am not aware of any ppc instruction which can set a "mark" or provide any high granularity flag against single or subgroup of vec regs' validity. But ppc experts may want to correct me. Thanks, - Simon
From: Simon Guo > Sent: 27 September 2017 19:34 ... > > On X86 all the AVX registers are caller saved, the system call > > entry could issue the instruction that invalidates them all. > > Kernel code running in the context of a user process could then > > use the registers without saving them. > > It would only need to set a mark to ensure they are invalidated > > again on return to user (might be cheap enough to do anyway). > > Dunno about PPC though. > > I am not aware of any ppc instruction which can set a "mark" or provide > any high granularity flag against single or subgroup of vec regs' validity. > But ppc experts may want to correct me. I was just thinking of a software flag. David
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 7330150..e6530d8 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -49,7 +49,7 @@ void __trace_hcall_exit(long opcode, unsigned long retval, /* VMX copying */ int enter_vmx_usercopy(void); int exit_vmx_usercopy(void); -int enter_vmx_copy(void); +int enter_vmx_ops(void); void * exit_vmx_copy(void *dest); /* Traps */ diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S index ca5fc8f..9e7729e 100644 --- a/arch/powerpc/lib/copypage_power7.S +++ b/arch/powerpc/lib/copypage_power7.S @@ -60,7 +60,7 @@ _GLOBAL(copypage_power7) std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) std r0,16(r1) stdu r1,-STACKFRAMESIZE(r1) - bl enter_vmx_copy + bl enter_vmx_ops cmpwi r3,0 ld r0,STACKFRAMESIZE+16(r1) ld r3,STK_REG(R31)(r1) diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S index 6dccfb8..40218fc 100644 --- a/arch/powerpc/lib/memcmp_64.S +++ b/arch/powerpc/lib/memcmp_64.S @@ -162,6 +162,13 @@ _GLOBAL(memcmp) blr .Llong: +#ifdef CONFIG_ALTIVEC + /* Try to use vmx loop if length is larger than 4K */ + cmpldi cr6,r5,4096 + bgt cr6,.Lvmx_cmp + +.Llong_novmx_cmp: +#endif li off8,8 li off16,16 li off24,24 @@ -319,4 +326,79 @@ _GLOBAL(memcmp) 8: blr +#ifdef CONFIG_ALTIVEC +.Lvmx_cmp: + mflr r0 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_ops + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + addi r1,r1,STACKFRAMESIZE + mtlr r0 + beq cr1,.Llong_novmx_cmp + +3: + /* Enter with src/dst address 8 bytes aligned, and len is + * no less than 4KB. Need to align with 16 bytes further. + */ + andi. 
rA,r3,8 + beq 4f + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + + addi r3,r3,8 + addi r4,r4,8 + addi r5,r5,-8 + +4: + /* compare 32 bytes for each loop */ + srdi r0,r5,5 + mtctr r0 + andi. r5,r5,31 + li off16,16 + +.balign 16 +5: + lvx v0,0,r3 + lvx v1,0,r4 + vcmpequd. v0,v0,v1 + bf 24,7f + lvx v0,off16,r3 + lvx v1,off16,r4 + vcmpequd. v0,v0,v1 + bf 24,6f + addi r3,r3,32 + addi r4,r4,32 + bdnz 5b + + cmpdi r5,0 + beq .Lzero + b .L8bytes_aligned + +6: + addi r3,r3,16 + addi r4,r4,16 + +7: + LD rA,0,r3 + LD rB,0,r4 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + + li off8,8 + LD rA,off8,r3 + LD rB,off8,r4 + cmpld cr0,rA,rB + bne cr0,.LcmpAB_lightweight + b .Lzero +#endif EXPORT_SYMBOL(memcmp) diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S index 193909a..682e386 100644 --- a/arch/powerpc/lib/memcpy_power7.S +++ b/arch/powerpc/lib/memcpy_power7.S @@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7) std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) std r0,16(r1) stdu r1,-STACKFRAMESIZE(r1) - bl enter_vmx_copy + bl enter_vmx_ops cmpwi cr1,r3,0 ld r0,STACKFRAMESIZE+16(r1) ld r3,STK_REG(R31)(r1) diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index bf925cd..923a9ab 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -53,7 +53,7 @@ int exit_vmx_usercopy(void) return 0; } -int enter_vmx_copy(void) +int enter_vmx_ops(void) { if (in_interrupt()) return 0;