
[v2,2/3] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparison

Message ID 1505950480-14830-3-git-send-email-wei.guo.simon@gmail.com (mailing list archive)
State Superseded
Series powerpc/64: memcmp() optimization

Commit Message

Simon Guo Sept. 20, 2017, 11:34 p.m. UTC
From: Simon Guo <wei.guo.simon@gmail.com>

This patch adds VMX primitives to do memcmp() when the compare size
exceeds 4K bytes.

Test result with the following test program (replace the leading "^>" with ""):
------
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include <malloc.h>
>#include <stdlib.h>
>#include <string.h>
>#include <time.h>
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
        char *s1;
        char *s2;
        unsigned long i;

        s1 = memalign(128, SIZE);
        if (!s1) {
                perror("memalign");
                exit(1);
        }

        s2 = memalign(128, SIZE);
        if (!s2) {
                perror("memalign");
                exit(1);
        }

        for (i = 0; i < SIZE; i++)  {
                s1[i] = i & 0xff;
                s2[i] = i & 0xff;
        }
        for (i = 0; i < ITERATIONS; i++) {
		int ret = test_memcmp(s1, s2, SIZE);

		if (ret) {
			printf("return %d at[%ld]! should have returned zero\n", ret, i);
			abort();
		}
	}

        return 0;
}

int main(void)
{
        return test_harness(testcase, "memcmp");
}
------
Without VMX patch:
       7.435191479 seconds time elapsed                                          ( +- 0.51% )
With VMX patch:
       6.802038938 seconds time elapsed                                          ( +- 0.56% )
There is a ~8% improvement.

However, I am not aware of any in-kernel use case for memcmp() on
large sizes yet.
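
For reference, a rough C-level sketch of the flow the new assembly
implements (illustrative only: vec_equal_32() and scalar_memcmp() are
made-up names standing in for the lvx/vcmpequd. loop and the existing
8-byte-word path in memcmp_64.S, and exit_vmx_ops() is an assumed
counterpart to enter_vmx_ops()):
------
#include <stddef.h>

int enter_vmx_ops(void);                        /* real: vmx-helper.c */
void exit_vmx_ops(void);                        /* assumed counterpart */
int vec_equal_32(const void *a, const void *b); /* stand-in for the vector loop body */
int scalar_memcmp(const void *a, const void *b, size_t n); /* existing word loop */

int memcmp_long(const unsigned char *s1, const unsigned char *s2, size_t n)
{
	if (n > 4096 && enter_vmx_ops()) {
		/* the asm aligns s1 to 16 bytes first, then compares
		 * 32 bytes per iteration with two lvx/vcmpequd. pairs
		 */
		while (n >= 32) {
			if (!vec_equal_32(s1, s2))
				break;  /* let the scalar code report the diff */
			s1 += 32;
			s2 += 32;
			n -= 32;
		}
		exit_vmx_ops();
	}
	return scalar_memcmp(s1, s2, n);
}
------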

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/lib/copypage_power7.S        |  2 +-
 arch/powerpc/lib/memcmp_64.S              | 82 +++++++++++++++++++++++++++++++
 arch/powerpc/lib/memcpy_power7.S          |  2 +-
 arch/powerpc/lib/vmx-helper.c             |  2 +-
 5 files changed, 86 insertions(+), 4 deletions(-)

Comments

Simon Guo Sept. 21, 2017, 12:54 a.m. UTC | #1
Hi,
On Thu, Sep 21, 2017 at 07:34:39AM +0800, wei.guo.simon@gmail.com wrote:
> From: Simon Guo <wei.guo.simon@gmail.com>
> 
> This patch add VMX primitives to do memcmp() in case the compare size
> exceeds 4K bytes.
> 
> Test result with following test program(replace the "^>" with ""):
> ------
I missed the exit_vmx_ops() part and need to rework it in v3.

Thanks,
- Simon
Cyril Bur Sept. 22, 2017, 2:06 p.m. UTC | #2
On Thu, 2017-09-21 at 07:34 +0800, wei.guo.simon@gmail.com wrote:
> From: Simon Guo <wei.guo.simon@gmail.com>
> 
> This patch add VMX primitives to do memcmp() in case the compare size
> exceeds 4K bytes.
> 

Hi Simon,

Sorry I didn't see this sooner. I've actually been working on a kernel
version of glibc commit dec4a7105e (powerpc: Improve memcmp performance
for POWER8), but unfortunately I've been distracted and it still isn't
done. I wonder if we can consolidate our efforts here. One thing I did
come across in my testing is that for a memcmp() that will fail early (I
haven't narrowed down the optimal number yet), the cost of enabling VMX
actually turns out to be a performance regression, so I've added a small
check of the first 64 bytes at the start, before enabling VMX, to make
sure the penalty is worth taking.
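
In rough C terms the guard looks like this (a sketch only, with made-up
helper names; the real check is done in assembly before calling
enter_vmx_ops()):

	/* Sketch only; scalar_memcmp(), vmx_memcmp() and exit_vmx_ops()
	 * are made-up / assumed names standing in for the word-at-a-time
	 * path, the vector loop and the VMX teardown.
	 */
	static int memcmp_with_precheck(const unsigned char *s1,
					const unsigned char *s2, size_t n)
	{
		int ret = scalar_memcmp(s1, s2, 64); /* n is known to be large */

		if (ret)
			return ret;     /* early mismatch: VMX isn't worth enabling */

		if (!enter_vmx_ops())
			return scalar_memcmp(s1 + 64, s2 + 64, n - 64);

		ret = vmx_memcmp(s1 + 64, s2 + 64, n - 64);
		exit_vmx_ops();         /* assumed counterpart */
		return ret;
	}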

Also, you should consider doing 4K and greater: KSM (Kernel Samepage
Merging) uses PAGE_SIZE, which can be as small as 4K.

Cyril

> Test result with following test program(replace the "^>" with ""):
> ------
> > # cat tools/testing/selftests/powerpc/stringloops/memcmp.c
> > #include <malloc.h>
> > #include <stdlib.h>
> > #include <string.h>
> > #include <time.h>
> > #include "utils.h"
> > #define SIZE (1024 * 1024 * 900)
> > #define ITERATIONS 40
> 
> int test_memcmp(const void *s1, const void *s2, size_t n);
> 
> static int testcase(void)
> {
>         char *s1;
>         char *s2;
>         unsigned long i;
> 
>         s1 = memalign(128, SIZE);
>         if (!s1) {
>                 perror("memalign");
>                 exit(1);
>         }
> 
>         s2 = memalign(128, SIZE);
>         if (!s2) {
>                 perror("memalign");
>                 exit(1);
>         }
> 
>         for (i = 0; i < SIZE; i++)  {
>                 s1[i] = i & 0xff;
>                 s2[i] = i & 0xff;
>         }
>         for (i = 0; i < ITERATIONS; i++) {
> 		int ret = test_memcmp(s1, s2, SIZE);
> 
> 		if (ret) {
> 			printf("return %d at[%ld]! should have returned zero\n", ret, i);
> 			abort();
> 		}
> 	}
> 
>         return 0;
> }
> 
> int main(void)
> {
>         return test_harness(testcase, "memcmp");
> }
> ------
> Without VMX patch:
>        7.435191479 seconds time elapsed                                          ( +- 0.51% )
> With VMX patch:
>        6.802038938 seconds time elapsed                                          ( +- 0.56% )
> 		There is ~+8% improvement.
> 
> However I am not aware whether there is use case in kernel for memcmp on
> large size yet.
> 
> Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
> ---
>  arch/powerpc/include/asm/asm-prototypes.h |  2 +-
>  arch/powerpc/lib/copypage_power7.S        |  2 +-
>  arch/powerpc/lib/memcmp_64.S              | 82 +++++++++++++++++++++++++++++++
>  arch/powerpc/lib/memcpy_power7.S          |  2 +-
>  arch/powerpc/lib/vmx-helper.c             |  2 +-
>  5 files changed, 86 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
> index 7330150..e6530d8 100644
> --- a/arch/powerpc/include/asm/asm-prototypes.h
> +++ b/arch/powerpc/include/asm/asm-prototypes.h
> @@ -49,7 +49,7 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
>  /* VMX copying */
>  int enter_vmx_usercopy(void);
>  int exit_vmx_usercopy(void);
> -int enter_vmx_copy(void);
> +int enter_vmx_ops(void);
>  void * exit_vmx_copy(void *dest);
>  
>  /* Traps */
> diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
> index ca5fc8f..9e7729e 100644
> --- a/arch/powerpc/lib/copypage_power7.S
> +++ b/arch/powerpc/lib/copypage_power7.S
> @@ -60,7 +60,7 @@ _GLOBAL(copypage_power7)
>  	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
>  	std	r0,16(r1)
>  	stdu	r1,-STACKFRAMESIZE(r1)
> -	bl	enter_vmx_copy
> +	bl	enter_vmx_ops
>  	cmpwi	r3,0
>  	ld	r0,STACKFRAMESIZE+16(r1)
>  	ld	r3,STK_REG(R31)(r1)
> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index 6dccfb8..40218fc 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -162,6 +162,13 @@ _GLOBAL(memcmp)
>  	blr
>  
>  .Llong:
> +#ifdef CONFIG_ALTIVEC
> +	/* Try to use vmx loop if length is larger than 4K */
> +	cmpldi  cr6,r5,4096
> +	bgt	cr6,.Lvmx_cmp
> +
> +.Llong_novmx_cmp:
> +#endif
>  	li	off8,8
>  	li	off16,16
>  	li	off24,24
> @@ -319,4 +326,79 @@ _GLOBAL(memcmp)
>  8:
>  	blr
>  
> +#ifdef CONFIG_ALTIVEC
> +.Lvmx_cmp:
> +	mflr    r0
> +	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
> +	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
> +	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
> +	std     r0,16(r1)
> +	stdu    r1,-STACKFRAMESIZE(r1)
> +	bl      enter_vmx_ops
> +	cmpwi   cr1,r3,0
> +	ld      r0,STACKFRAMESIZE+16(r1)
> +	ld      r3,STK_REG(R31)(r1)
> +	ld      r4,STK_REG(R30)(r1)
> +	ld      r5,STK_REG(R29)(r1)
> +	addi	r1,r1,STACKFRAMESIZE
> +	mtlr    r0
> +	beq     cr1,.Llong_novmx_cmp
> +
> +3:
> +	/* Enter with src/dst address 8 bytes aligned, and len is
> +	 * no less than 4KB. Need to align with 16 bytes further.
> +	 */
> +	andi.	rA,r3,8
> +	beq	4f
> +	LD	rA,0,r3
> +	LD	rB,0,r4
> +	cmpld	cr0,rA,rB
> +	bne	cr0,.LcmpAB_lightweight
> +
> +	addi	r3,r3,8
> +	addi	r4,r4,8
> +	addi	r5,r5,-8
> +
> +4:
> +	/* compare 32 bytes for each loop */
> +	srdi	r0,r5,5
> +	mtctr	r0
> +	andi.	r5,r5,31
> +	li	off16,16
> +
> +.balign 16
> +5:
> +	lvx 	v0,0,r3
> +	lvx 	v1,0,r4
> +	vcmpequd. v0,v0,v1
> +	bf	24,7f
> +	lvx 	v0,off16,r3
> +	lvx 	v1,off16,r4
> +	vcmpequd. v0,v0,v1
> +	bf	24,6f
> +	addi	r3,r3,32
> +	addi	r4,r4,32
> +	bdnz	5b
> +
> +	cmpdi	r5,0
> +	beq	.Lzero
> +	b	.L8bytes_aligned
> +
> +6:
> +	addi	r3,r3,16
> +	addi	r4,r4,16
> +
> +7:
> +	LD	rA,0,r3
> +	LD	rB,0,r4
> +	cmpld	cr0,rA,rB
> +	bne	cr0,.LcmpAB_lightweight
> +
> +	li	off8,8
> +	LD	rA,off8,r3
> +	LD	rB,off8,r4
> +	cmpld	cr0,rA,rB
> +	bne	cr0,.LcmpAB_lightweight
> +	b	.Lzero
> +#endif
>  EXPORT_SYMBOL(memcmp)
> diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
> index 193909a..682e386 100644
> --- a/arch/powerpc/lib/memcpy_power7.S
> +++ b/arch/powerpc/lib/memcpy_power7.S
> @@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
>  	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
>  	std	r0,16(r1)
>  	stdu	r1,-STACKFRAMESIZE(r1)
> -	bl	enter_vmx_copy
> +	bl	enter_vmx_ops
>  	cmpwi	cr1,r3,0
>  	ld	r0,STACKFRAMESIZE+16(r1)
>  	ld	r3,STK_REG(R31)(r1)
> diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
> index bf925cd..923a9ab 100644
> --- a/arch/powerpc/lib/vmx-helper.c
> +++ b/arch/powerpc/lib/vmx-helper.c
> @@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
>  	return 0;
>  }
>  
> -int enter_vmx_copy(void)
> +int enter_vmx_ops(void)
>  {
>  	if (in_interrupt())
>  		return 0;
Simon Guo Sept. 23, 2017, 9:18 p.m. UTC | #3
Hi Cyril,
On Sat, Sep 23, 2017 at 12:06:48AM +1000, Cyril Bur wrote:
> On Thu, 2017-09-21 at 07:34 +0800, wei.guo.simon@gmail.com wrote:
> > From: Simon Guo <wei.guo.simon@gmail.com>
> > 
> > This patch add VMX primitives to do memcmp() in case the compare size
> > exceeds 4K bytes.
> > 
> 
> Hi Simon,
> 
> Sorry I didn't see this sooner, I've actually been working on a kernel
> version of glibc commit dec4a7105e (powerpc: Improve memcmp performance
> for POWER8) unfortunately I've been distracted and it still isn't done.
Thanks for syncing with me. Let's consolidate our efforts :)

I had a quick look at glibc commit dec4a7105e.
It looks like the aligned-case comparison with VSX is entered without any
rN size limitation, which means it will take the VSX reg load penalty even
when the length is only 9 bytes.

It does some optimization when the src/dst addresses don't have the same
offset relative to an 8-byte alignment boundary. I need to read it more closely.

> I wonder if we can consolidate our efforts here. One thing I did come
> across in my testing is that for memcmp() that will fail early (I
> haven't narrowed down the the optimal number yet) the cost of enabling
> VMX actually turns out to be a performance regression, as such I've
> added a small check of the first 64 bytes to the start before enabling
> VMX to ensure the penalty is worth taking.
Will there still be a penalty if the 65th byte differs?  

> 
> Also, you should consider doing 4K and greater, KSM (Kernel Samepage
> Merging) uses PAGE_SIZE which can be as small as 4K.
Currently VMX is only applied when the size exceeds 4K. Are you
suggesting a bigger threshold than 4K?

We can sync more offline for v3.

Thanks,
- Simon
Cyril Bur Sept. 25, 2017, 11:59 p.m. UTC | #4
On Sun, 2017-09-24 at 05:18 +0800, Simon Guo wrote:
> Hi Cyril,
> On Sat, Sep 23, 2017 at 12:06:48AM +1000, Cyril Bur wrote:
> > On Thu, 2017-09-21 at 07:34 +0800, wei.guo.simon@gmail.com wrote:
> > > From: Simon Guo <wei.guo.simon@gmail.com>
> > > 
> > > This patch add VMX primitives to do memcmp() in case the compare size
> > > exceeds 4K bytes.
> > > 
> > 
> > Hi Simon,
> > 
> > Sorry I didn't see this sooner, I've actually been working on a kernel
> > version of glibc commit dec4a7105e (powerpc: Improve memcmp performance
> > for POWER8) unfortunately I've been distracted and it still isn't done.
> 
> Thanks for sync with me. Let's consolidate our effort together :)
> 
> I have a quick check on glibc commit dec4a7105e. 
> Looks the aligned case comparison with VSX is launched without rN size
> limitation, which means it will have a VSX reg load penalty even when the 
> length is 9 bytes.
> 

This was written for userspace which doesn't have to explicitly enable
VMX in order to use it - we need to be smarter in the kernel.

> It did some optimization when src/dest addrs don't have the same offset 
> on 8 bytes alignment boundary. I need to read more closely.
> 
> > I wonder if we can consolidate our efforts here. One thing I did come
> > across in my testing is that for memcmp() that will fail early (I
> > haven't narrowed down the the optimal number yet) the cost of enabling
> > VMX actually turns out to be a performance regression, as such I've
> > added a small check of the first 64 bytes to the start before enabling
> > VMX to ensure the penalty is worth taking.
> 
> Will there still be a penalty if the 65th byte differs?  
> 

I haven't benchmarked it exactly; my rationale for 64 bytes was that it
is the stride of the vectorised copy loop, so if we know we'll fail
before even completing one iteration of the vectorised loop there isn't
any point in using the vector regs.

> > 
> > Also, you should consider doing 4K and greater, KSM (Kernel Samepage
> > Merging) uses PAGE_SIZE which can be as small as 4K.
> 
> Currently the VMX will only be applied when size exceeds 4K. Are you
> suggesting a bigger threshold than 4K?
> 

Equal to or greater than 4K; KSM will benefit.

> We can sync more offline for v3.
> 
> Thanks,
> - Simon
Michael Ellerman Sept. 26, 2017, 5:34 a.m. UTC | #5
Cyril Bur <cyrilbur@gmail.com> writes:

> On Sun, 2017-09-24 at 05:18 +0800, Simon Guo wrote:
>> Hi Cyril,
>> On Sat, Sep 23, 2017 at 12:06:48AM +1000, Cyril Bur wrote:
>> > On Thu, 2017-09-21 at 07:34 +0800, wei.guo.simon@gmail.com wrote:
>> > > From: Simon Guo <wei.guo.simon@gmail.com>
>> > > 
>> > > This patch add VMX primitives to do memcmp() in case the compare size
>> > > exceeds 4K bytes.
>> > 
>> > Sorry I didn't see this sooner, I've actually been working on a kernel
>> > version of glibc commit dec4a7105e (powerpc: Improve memcmp performance
>> > for POWER8) unfortunately I've been distracted and it still isn't done.
>> 
>> Thanks for sync with me. Let's consolidate our effort together :)
>> 
>> I have a quick check on glibc commit dec4a7105e. 
>> Looks the aligned case comparison with VSX is launched without rN size
>> limitation, which means it will have a VSX reg load penalty even when the 
>> length is 9 bytes.
>> 
>
> This was written for userspace which doesn't have to explicitly enable
> VMX in order to use it - we need to be smarter in the kernel.

Well the kernel has to do it for them after a trap, which is actually
even more expensive, so arguably the glibc code should be smarter too
and the threshold before using VMX should probably be higher than in the
kernel (to cover the cost of the trap).

But I digress :)

cheers
Segher Boessenkool Sept. 26, 2017, 11:26 a.m. UTC | #6
On Tue, Sep 26, 2017 at 03:34:36PM +1000, Michael Ellerman wrote:
> Cyril Bur <cyrilbur@gmail.com> writes:
> > This was written for userspace which doesn't have to explicitly enable
> > VMX in order to use it - we need to be smarter in the kernel.
> 
> Well the kernel has to do it for them after a trap, which is actually
> even more expensive, so arguably the glibc code should be smarter too
> and the threshold before using VMX should probably be higher than in the
> kernel (to cover the cost of the trap).

A lot of userspace code uses V*X, more and more with newer CPUs and newer
compiler versions.  If you have already paid the price for using vector
registers you do not need to pay it again :-)

> But I digress :)

Yeah sorry :-)


Segher
Michael Ellerman Sept. 27, 2017, 3:38 a.m. UTC | #7
Segher Boessenkool <segher@kernel.crashing.org> writes:

> On Tue, Sep 26, 2017 at 03:34:36PM +1000, Michael Ellerman wrote:
>> Cyril Bur <cyrilbur@gmail.com> writes:
>> > This was written for userspace which doesn't have to explicitly enable
>> > VMX in order to use it - we need to be smarter in the kernel.
>> 
>> Well the kernel has to do it for them after a trap, which is actually
>> even more expensive, so arguably the glibc code should be smarter too
>> and the threshold before using VMX should probably be higher than in the
>> kernel (to cover the cost of the trap).
>
> A lot of userspace code uses V*X, more and more with newer CPUs and newer
> compiler versions.  If you already paid the price for using vector
> registers you do not need to again :-)

True, but you don't know if you've paid the price already.

You also pay the price on every context switch (more state to switch),
so it's not free even once enabled. Which is why the kernel will
eventually turn it off if it's unused again.

But now that I've actually looked at the glibc version, it does do some
checks for minimum length before doing any vector instructions, so
that's probably all we want. The exact trade off between checking some
bytes without vector vs turning on vector depends on your input data, so
it's tricky to tune in general.

>> But I digress :)
>
> Yeah sorry :-)

:)

cheers
Segher Boessenkool Sept. 27, 2017, 9:27 a.m. UTC | #8
On Wed, Sep 27, 2017 at 01:38:09PM +1000, Michael Ellerman wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> > A lot of userspace code uses V*X, more and more with newer CPUs and newer
> > compiler versions.  If you already paid the price for using vector
> > registers you do not need to again :-)
> 
> True, but you don't know if you've paid the price already.
> 
> You also pay the price on every context switch (more state to switch),
> so it's not free even once enabled. Which is why the kernel will
> eventually turn it off if it's unused again.

Yup.  But my point is that because user space code uses vector registers
more and more, the penalty for it to use them even more keeps shrinking.

> But now that I've actually looked at the glibc version, it does do some
> checks for minimum length before doing any vector instructions, so
> that's probably all we want. The exact trade off between checking some
> bytes without vector vs turning on vector depends on your input data, so
> it's tricky to tune in general.

You also need nasty code to deal with the start and end of strings, with
conditional branches and whatnot, which quickly overwhelms the benefit
of using vector registers at all.  This tradeoff also changes with newer
ISA versions.

Things have to become *really* cheap before it will be good to often use
vector registers in the kernel though.


Segher
David Laight Sept. 27, 2017, 9:43 a.m. UTC | #9
From: Segher Boessenkool
> Sent: 27 September 2017 10:28
...
> You also need nasty code to deal with the start and end of strings, with
> conditional branches and whatnot, which quickly overwhelms the benefit
> of using vector registers at all.  This tradeoff also changes with newer
> ISA versions.

The goalposts keep moving.
For instance, with modern Intel x86 CPUs 'rep movsb' is by far the fastest
way to copy data (from cached memory).

> Things have to become *really* cheap before it will be good to often use
> vector registers in the kernel though.

I've had thoughts about this in the past.
If the vector registers belong to the current process then you might
get away with just saving the ones you want to use.
If they belong to a different process then you also need to tell the
FPU save code where you've saved the registers.
Then the IPI code can recover all the correct values.

On X86 all the AVX registers are caller saved, the system call
entry could issue the instruction that invalidates them all.
Kernel code running in the context of a user process could then
use the registers without saving them.
It would only need to set a mark to ensure they are invalidated
again on return to user (might be cheap enough to do anyway).
Dunno about PPC though.
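
A sketch of what that software flag could look like (purely illustrative,
with invented names; nothing like this exists today):

	/* Invented names throughout: just to illustrate marking the
	 * user-visible vector state as clobbered by in-kernel use, so the
	 * return-to-user path knows it has to invalidate (or restore) it.
	 */
	struct vec_flag {
		int clobbered_by_kernel;
	};

	static void kernel_begin_vector_use(struct vec_flag *f)
	{
		f->clobbered_by_kernel = 1;
		/* ... use the caller-saved vector regs without saving them ... */
	}

	static void return_to_user_fixup(struct vec_flag *f)
	{
		if (f->clobbered_by_kernel) {
			/* invalidate or restore the user's vector state here */
			f->clobbered_by_kernel = 0;
		}
	}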

	David
Simon Guo Sept. 27, 2017, 4:22 p.m. UTC | #10
Hi Michael,
On Wed, Sep 27, 2017 at 01:38:09PM +1000, Michael Ellerman wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> 
> > On Tue, Sep 26, 2017 at 03:34:36PM +1000, Michael Ellerman wrote:
> >> Cyril Bur <cyrilbur@gmail.com> writes:
> >> > This was written for userspace which doesn't have to explicitly enable
> >> > VMX in order to use it - we need to be smarter in the kernel.
> >> 
> >> Well the kernel has to do it for them after a trap, which is actually
> >> even more expensive, so arguably the glibc code should be smarter too
> >> and the threshold before using VMX should probably be higher than in the
> >> kernel (to cover the cost of the trap).
> >
> > A lot of userspace code uses V*X, more and more with newer CPUs and newer
> > compiler versions.  If you already paid the price for using vector
> > registers you do not need to again :-)
> 
> True, but you don't know if you've paid the price already.
> 
> You also pay the price on every context switch (more state to switch),
> so it's not free even once enabled. Which is why the kernel will
> eventually turn it off if it's unused again.
> 
> But now that I've actually looked at the glibc version, it does do some
> checks for minimum length before doing any vector instructions, so

It looks like the glibc version will use VSX instructions and lead to a trap
on a 9-byte compare with src/dst 16-byte aligned.
 132         /* Now both rSTR1 and rSTR2 are aligned to QW.  */
 133         .align  4
 134 L(qw_align):
 135         vspltisb        v0, 0
 136         srdi.   r6, rN, 6
 137         li      r8, 16
 138         li      r10, 32
 139         li      r11, 48
 140         ble     cr0, L(lessthan64)
 141         mtctr   r6
 142         vspltisb        v8, 0
 143         vspltisb        v6, 0
 144         /* Aligned vector loop.  */
 145         .align  4
Line 135 is the VSX instruction that causes the trap. Did I miss anything?

> that's probably all we want. The exact trade off between checking some
> bytes without vector vs turning on vector depends on your input data, so
> it's tricky to tune in general.

I discussed this offline with Cyril. The plan for v3 is to use >= 4KB as the
minimum length before the vector regs step in. Cyril will later consolidate
his existing work on KSM optimization, which will probably do a 64-byte
comparison-ahead to determine whether it is an early or late matching pattern.

Cyril also has some other valuable comments and I will rework things in v3.

Is that OK with you?

Thanks,
- Simon
Simon Guo Sept. 27, 2017, 6:33 p.m. UTC | #11
On Wed, Sep 27, 2017 at 09:43:44AM +0000, David Laight wrote:
> From: Segher Boessenkool
> > Sent: 27 September 2017 10:28
> ...
> > You also need nasty code to deal with the start and end of strings, with
> > conditional branches and whatnot, which quickly overwhelms the benefit
> > of using vector registers at all.  This tradeoff also changes with newer
> > ISA versions.
> 
> The goal posts keep moving.
> For instance with modern intel x86 cpus 'rep movsb' is by far the fastest
> way to copy data (from cached memory).
> 
> > Things have to become *really* cheap before it will be good to often use
> > vector registers in the kernel though.
> 
> I've had thoughts about this in the past.
> If the vector registers belong to the current process then you might
> get away with just saving the ones you want to use.
> If they belong to a different process then you also need to tell the
> FPU save code where you've saved the registers.
> Then the IPI code can recover all the correct values.
> 
> On X86 all the AVX registers are caller saved, the system call
> entry could issue the instruction that invalidates them all.
> Kernel code running in the context of a user process could then
> use the registers without saving them.
> It would only need to set a mark to ensure they are invalidated
> again on return to user (might be cheap enough to do anyway).
> Dunno about PPC though.

I am not aware of any ppc instruction that can set a "mark" or provide a
fine-grained validity flag for a single vector register or a subgroup of
them. But ppc experts may want to correct me.

Thanks,
- Simon
David Laight Sept. 28, 2017, 9:24 a.m. UTC | #12
From: Simon Guo
> Sent: 27 September 2017 19:34
...
> > On X86 all the AVX registers are caller saved, the system call
> > entry could issue the instruction that invalidates them all.
> > Kernel code running in the context of a user process could then
> > use the registers without saving them.
> > It would only need to set a mark to ensure they are invalidated
> > again on return to user (might be cheap enough to do anyway).
> > Dunno about PPC though.
> 
> I am not aware of any ppc instruction which can set a "mark" or provide
> any high granularity flag against single or subgroup of vec regs' validity.
> But ppc experts may want to correct me.

I was just thinking of a software flag.

	David

Patch

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 7330150..e6530d8 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,7 +49,7 @@  void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
+int enter_vmx_ops(void);
 void * exit_vmx_copy(void *dest);
 
 /* Traps */
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index ca5fc8f..9e7729e 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -60,7 +60,7 @@  _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 6dccfb8..40218fc 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -162,6 +162,13 @@  _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+	/* Try to use vmx loop if length is larger than 4K */
+	cmpldi  cr6,r5,4096
+	bgt	cr6,.Lvmx_cmp
+
+.Llong_novmx_cmp:
+#endif
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -319,4 +326,79 @@  _GLOBAL(memcmp)
 8:
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lvmx_cmp:
+	mflr    r0
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
+	std     r0,16(r1)
+	stdu    r1,-STACKFRAMESIZE(r1)
+	bl      enter_vmx_ops
+	cmpwi   cr1,r3,0
+	ld      r0,STACKFRAMESIZE+16(r1)
+	ld      r3,STK_REG(R31)(r1)
+	ld      r4,STK_REG(R30)(r1)
+	ld      r5,STK_REG(R29)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	mtlr    r0
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* Enter with src/dst address 8 bytes aligned, and len is
+	 * no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	beq	4f
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	andi.	r5,r5,31
+	li	off16,16
+
+.balign 16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	vcmpequd. v0,v0,v1
+	bf	24,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	vcmpequd. v0,v0,v1
+	bf	24,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.L8bytes_aligned
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+
+	li	off8,8
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index 193909a..682e386 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@  _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..923a9ab 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@  int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;