[v5,2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison

Message ID 1527058083-6998-3-git-send-email-wei.guo.simon@gmail.com (mailing list archive)
State Superseded
Series powerpc/64: memcmp() optimization

Commit Message

Simon Guo May 23, 2018, 6:48 a.m. UTC
From: Simon Guo <wei.guo.simon@gmail.com>

This patch adds VMX primitives to do memcmp() in case the compare size
exceeds 4K bytes. The KSM feature can benefit from this.
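
At a high level, the dispatch added by this patch is equivalent to the
following C sketch. This is an illustration only: the real implementation
is the PPC64 assembly in arch/powerpc/lib/memcmp_64.S, the CONFIG_ALTIVEC
ifdef is elided, and vmx_compare_loop()/scalar_compare() are made-up names
for the vector loop and the pre-existing 8-bytes-at-a-time path.

	/* Sketch only -- not the real (assembly) implementation. */
	#define VMX_OPS_THRES	4096

	int memcmp_sketch(const void *s1, const void *s2, size_t n)
	{
		/* enter_vmx_ops() returns 0 in interrupt context, in which
		 * case we must fall back to the scalar path.
		 */
		if (n >= VMX_OPS_THRES && enter_vmx_ops()) {
			int ret = vmx_compare_loop(s1, s2, n); /* 32 bytes per iteration */

			exit_vmx_ops(NULL);
			return ret;
		}

		return scalar_compare(s1, s2, n);
	}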

Test result with the following test program:
------
# cat tools/testing/selftests/powerpc/stringloops/memcmp.c
#include <malloc.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "utils.h"
#define SIZE (1024 * 1024 * 900)
#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
        char *s1;
        char *s2;
        unsigned long i;

        s1 = memalign(128, SIZE);
        if (!s1) {
                perror("memalign");
                exit(1);
        }

        s2 = memalign(128, SIZE);
        if (!s2) {
                perror("memalign");
                exit(1);
        }

        for (i = 0; i < SIZE; i++)  {
                s1[i] = i & 0xff;
                s2[i] = i & 0xff;
        }
        for (i = 0; i < ITERATIONS; i++) {
                int ret = test_memcmp(s1, s2, SIZE);

                if (ret) {
                        printf("return %d at[%ld]! should have returned zero\n", ret, i);
                        abort();
                }
        }

        return 0;
}

int main(void)
{
        return test_harness(testcase, "memcmp");
}
------
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
	4.726728762 seconds time elapsed                                          ( +-  3.54%)
With VMX patch:
	4.234335473 seconds time elapsed                                          ( +-  2.63%)
This is a ~10% improvement.

Testing an unaligned, different-offset version (shifting s1 and s2 by
random offsets within 16 bytes) achieves an improvement higher than 10%.
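
A minimal sketch of such a shifted-offset variant of testcase() above
(the random offsets within 16 bytes and the extra 16-byte allocation slack
are illustrative assumptions, not the exact test that produced the numbers;
it relies on the same includes as the program above):

	/* Shifted-offset variant of testcase(); sketch only. */
	static int testcase_unaligned(void)
	{
		char *p1 = memalign(128, SIZE + 16);
		char *p2 = memalign(128, SIZE + 16);
		char *s1, *s2;
		unsigned long i;

		if (!p1 || !p2) {
			perror("memalign");
			exit(1);
		}

		srandom(time(NULL));
		/* give s1 and s2 different, random offsets within 16 bytes */
		s1 = p1 + (random() % 16);
		s2 = p2 + (random() % 16);

		for (i = 0; i < SIZE; i++) {
			s1[i] = i & 0xff;
			s2[i] = i & 0xff;
		}

		for (i = 0; i < ITERATIONS; i++) {
			int ret = test_memcmp(s1, s2, SIZE);

			if (ret) {
				printf("return %d at[%ld]! should have returned zero\n", ret, i);
				abort();
			}
		}

		return 0;
	}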

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S        |   4 +-
 arch/powerpc/lib/memcmp_64.S              | 231 ++++++++++++++++++++++++++++++
 arch/powerpc/lib/memcpy_power7.S          |   6 +-
 arch/powerpc/lib/vmx-helper.c             |   4 +-
 5 files changed, 240 insertions(+), 9 deletions(-)

Comments

Simon Guo May 23, 2018, 3:37 p.m. UTC | #1
Hi Michael,
On Thu, May 24, 2018 at 05:44:33PM +1000, Michael Ellerman wrote:
> Hi Simon,
> 
> wei.guo.simon@gmail.com writes:
> > From: Simon Guo <wei.guo.simon@gmail.com>
> >
> > This patch add VMX primitives to do memcmp() in case the compare size
> > exceeds 4K bytes. KSM feature can benefit from this.
> 
> You say "exceeds 4K" here.
> 
It should be >= 4K. I will correct the message.

> > diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> > index f20e883..6303bbf 100644
> > --- a/arch/powerpc/lib/memcmp_64.S
> > +++ b/arch/powerpc/lib/memcmp_64.S
> > @@ -27,12 +27,73 @@
> >  #define LH	lhbrx
> >  #define LW	lwbrx
> >  #define LD	ldbrx
> > +#define LVS	lvsr
> > +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> > +	vperm _VRT,_VRB,_VRA,_VRC
> >  #else
> >  #define LH	lhzx
> >  #define LW	lwzx
> >  #define LD	ldx
> > +#define LVS	lvsl
> > +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> > +	vperm _VRT,_VRA,_VRB,_VRC
> >  #endif
> >  
> > +#define VMX_OPS_THRES 4096
> 
> THRES == 4096
> 
> BTW, can we call it VMX_THRESH ?
> 
Sure. I will update it.

> > +#define ENTER_VMX_OPS	\
> > +	mflr    r0;	\
> > +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> > +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> > +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> > +	std     r0,16(r1); \
> > +	stdu    r1,-STACKFRAMESIZE(r1); \
> > +	bl      enter_vmx_ops; \
> > +	cmpwi   cr1,r3,0; \
> > +	ld      r0,STACKFRAMESIZE+16(r1); \
> > +	ld      r3,STK_REG(R31)(r1); \
> > +	ld      r4,STK_REG(R30)(r1); \
> > +	ld      r5,STK_REG(R29)(r1); \
> > +	addi	r1,r1,STACKFRAMESIZE; \
> > +	mtlr    r0
> > +
> > +#define EXIT_VMX_OPS \
> > +	mflr    r0; \
> > +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> > +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> > +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> > +	std     r0,16(r1); \
> > +	stdu    r1,-STACKFRAMESIZE(r1); \
> > +	bl      exit_vmx_ops; \
> > +	ld      r0,STACKFRAMESIZE+16(r1); \
> > +	ld      r3,STK_REG(R31)(r1); \
> > +	ld      r4,STK_REG(R30)(r1); \
> > +	ld      r5,STK_REG(R29)(r1); \
> > +	addi	r1,r1,STACKFRAMESIZE; \
> > +	mtlr    r0
> > +
> > +/*
> > + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
> > + * 16 bytes boundary and permute the result with the 1st 16 bytes.
> > +
> > + *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
> > + *    ^                                  ^                                 ^
> > + * 0xbbbb10                          0xbbbb20                          0xbbb30
> > + *                                 ^
> > + *                                _vaddr
> > + *
> > + *
> > + * _vmask is the mask generated by LVS
> > + * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
> > + *   for example: 0xyyyyyyyyyyyyy012 for big endian
> > + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
> > + *   for example: 0x3456789abcdefzzz for big endian
> > + * The permute result is saved in _v_res.
> > + *   for example: 0x0123456789abcdef for big endian.
> > + */
> > +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
> > +        lvx     _v2nd_qw,_vaddr,off16; \
> > +        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
> > +
> >  /*
> >   * There are 2 categories for memcmp:
> >   * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
> > @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
> >  	blr
> >  
> >  .Llong:
> > +#ifdef CONFIG_ALTIVEC
> > +	/* Try to use vmx loop if length is larger than 4K */
> > +	cmpldi  cr6,r5,VMX_OPS_THRES
> > +	bge	cr6,.Lsameoffset_vmx_cmp
> 
> Here we compare the length to 4K and if it's greater *or equal* then we
> go to the VMX case. Or am I reading it backward?
> 
> So we should say "if the size is 4K or more we do VMX" shouldn't we?
Yes. Again, I need to reword the comment to "equal or greater than 4K"
here.

Thanks,
- Simon
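
For reference, the cmpldi cr6,r5,VMX_OPS_THRES / bge cr6,... pair being
discussed is equivalent to the following C condition (sketch only;
memcmp_vmx() and memcmp_scalar() are hypothetical names):

	/* bge after cmpldi: branch taken if r5 >= 4096, i.e. "4K or more" */
	if (n >= VMX_OPS_THRES)
		return memcmp_vmx(s1, s2, n);
	return memcmp_scalar(s1, s2, n);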
Michael Ellerman May 24, 2018, 7:44 a.m. UTC | #2
Hi Simon,

wei.guo.simon@gmail.com writes:
> From: Simon Guo <wei.guo.simon@gmail.com>
>
> This patch add VMX primitives to do memcmp() in case the compare size
> exceeds 4K bytes. KSM feature can benefit from this.

You say "exceeds 4K" here.

> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index f20e883..6303bbf 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -27,12 +27,73 @@
>  #define LH	lhbrx
>  #define LW	lwbrx
>  #define LD	ldbrx
> +#define LVS	lvsr
> +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> +	vperm _VRT,_VRB,_VRA,_VRC
>  #else
>  #define LH	lhzx
>  #define LW	lwzx
>  #define LD	ldx
> +#define LVS	lvsl
> +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> +	vperm _VRT,_VRA,_VRB,_VRC
>  #endif
>  
> +#define VMX_OPS_THRES 4096

THRES == 4096

BTW, can we call it VMX_THRESH ?

> +#define ENTER_VMX_OPS	\
> +	mflr    r0;	\
> +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> +	std     r0,16(r1); \
> +	stdu    r1,-STACKFRAMESIZE(r1); \
> +	bl      enter_vmx_ops; \
> +	cmpwi   cr1,r3,0; \
> +	ld      r0,STACKFRAMESIZE+16(r1); \
> +	ld      r3,STK_REG(R31)(r1); \
> +	ld      r4,STK_REG(R30)(r1); \
> +	ld      r5,STK_REG(R29)(r1); \
> +	addi	r1,r1,STACKFRAMESIZE; \
> +	mtlr    r0
> +
> +#define EXIT_VMX_OPS \
> +	mflr    r0; \
> +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> +	std     r0,16(r1); \
> +	stdu    r1,-STACKFRAMESIZE(r1); \
> +	bl      exit_vmx_ops; \
> +	ld      r0,STACKFRAMESIZE+16(r1); \
> +	ld      r3,STK_REG(R31)(r1); \
> +	ld      r4,STK_REG(R30)(r1); \
> +	ld      r5,STK_REG(R29)(r1); \
> +	addi	r1,r1,STACKFRAMESIZE; \
> +	mtlr    r0
> +
> +/*
> + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
> + * 16 bytes boundary and permute the result with the 1st 16 bytes.
> +
> + *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
> + *    ^                                  ^                                 ^
> + * 0xbbbb10                          0xbbbb20                          0xbbb30
> + *                                 ^
> + *                                _vaddr
> + *
> + *
> + * _vmask is the mask generated by LVS
> + * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
> + *   for example: 0xyyyyyyyyyyyyy012 for big endian
> + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
> + *   for example: 0x3456789abcdefzzz for big endian
> + * The permute result is saved in _v_res.
> + *   for example: 0x0123456789abcdef for big endian.
> + */
> +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
> +        lvx     _v2nd_qw,_vaddr,off16; \
> +        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
> +
>  /*
>   * There are 2 categories for memcmp:
>   * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
> @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
>  	blr
>  
>  .Llong:
> +#ifdef CONFIG_ALTIVEC
> +	/* Try to use vmx loop if length is larger than 4K */
> +	cmpldi  cr6,r5,VMX_OPS_THRES
> +	bge	cr6,.Lsameoffset_vmx_cmp

Here we compare the length to 4K and if it's greater *or equal* then we
go to the VMX case. Or am I reading it backward?

So we should say "if the size is 4K or more we do VMX" shouldn't we?

cheers
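
For context on the ENTER_VMX_OPS/EXIT_VMX_OPS macros quoted above: judging
from the vmx-helper.c fragments visible in the patch below, the renamed
helpers are expected to look roughly like this (a sketch reconstructed from
those fragments, not a verbatim copy of the file):

	int enter_vmx_ops(void)
	{
		if (in_interrupt())
			return 0;	/* caller falls back to the non-VMX path */

		preempt_disable();
		enable_kernel_altivec();
		return 1;
	}

	void *exit_vmx_ops(void *dest)
	{
		disable_kernel_altivec();
		preempt_enable();
		return dest;	/* returned so memcpy can tail-call this */
	}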
Patch

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@  void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@  _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@  _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index f20e883..6303bbf 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -27,12 +27,73 @@ 
 #define LH	lhbrx
 #define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH	lhzx
 #define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_OPS_THRES 4096
+#define ENTER_VMX_OPS	\
+	mflr    r0;	\
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      enter_vmx_ops; \
+	cmpwi   cr1,r3,0; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+#define EXIT_VMX_OPS \
+	mflr    r0; \
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      exit_vmx_ops; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbb30
+ *                                 ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+        lvx     _v2nd_qw,_vaddr,off16; \
+        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
 /*
  * There are 2 categories for memcmp:
  * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -174,6 +235,13 @@  _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+	/* Try to use vmx loop if length is larger than 4K */
+	cmpldi  cr6,r5,VMX_OPS_THRES
+	bge	cr6,.Lsameoffset_vmx_cmp
+
+.Llong_novmx_cmp:
+#endif
 	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
@@ -332,7 +400,94 @@  _GLOBAL(memcmp)
 8:
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Enter with src/dst addrs has the same offset with 8 bytes
+	 * align boundary
+	 */
+	ENTER_VMX_OPS
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset with r3
+	 * for 16 bytes boundary.
+	 */
+	xor	r0,r3,r4
+	andi.	r0,r0,0xf
+	bne	.Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	LD	rA,0,r3
+	beq	4f
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+	beq	cr0,4f
+	/* save and restore cr0 */
+	mfocrf  r5,64
+	EXIT_VMX_OPS
+	mtocrf	64,r5
+	b	.LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	clrldi  r5,r5,59
+	li	off16,16
+
+.balign 16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	vcmpequd. v0,v0,v1
+	bf	24,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	vcmpequd. v0,v0,v1
+	bf	24,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	EXIT_VMX_OPS
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	li	off8,8
+	bne	cr0,.LcmpAB_lightweight
+
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
+
 .Ldiffoffset_8bytes_make_align_start:
+#ifdef CONFIG_ALTIVEC
+	/* only do vmx ops when the size exceeds 4K bytes */
+	cmpdi	cr5,r5,VMX_OPS_THRES
+	bge	cr5,.Ldiffoffset_vmx_cmp
+.Ldiffoffset_novmx_cmp:
+#endif
+
 	/* now try to align s1 with 8 bytes */
 	andi.   r6,r3,0x7
 	rlwinm  r6,r6,3,0,28
@@ -359,6 +514,82 @@  _GLOBAL(memcmp)
 	/* now s1 is aligned with 8 bytes. */
 	cmpdi   cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b	.Llong_novmx_cmp
+#else
 	b	.Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	ENTER_VMX_OPS
+	beq     cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi.   r6,r3,0xf
+	li	off16,16
+	beq     .Ldiffoffset_vmx_s1_16bytes_align
 
+	LVS	v3,0,r3
+	LVS	v4,0,r4
+
+	lvx     v5,0,r3
+	lvx     v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	vcmpequb.  v7,v9,v10
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic  r6,r6,16
+	subf    r5,r6,r5
+	add     r3,r3,r6
+	add     r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx     v6,0,r4
+	LVS	v4,0,r4
+	srdi	r6,r5,5  /* loop for 32 bytes each */
+	clrldi  r5,r5,59
+	mtctr	r6
+
+.balign	16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	vcmpequb.	v7,v9,v10
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	vcmpequb.	v7,v9,v10
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	bdnz	.Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* anyway, the diff will appear in next 16 bytes */
+	li	r5,16
+	b	.Lcmp_lt32bytes
+
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d..070cdf6 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@  _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@  _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@  _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..9f34049 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@  int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@  int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();