diff mbox

[U-Boot,v2,02/12] x86: Add an accelerated memmove() function

Message ID 1475721740-15124-3-git-send-email-sjg@chromium.org
State Accepted
Commit a5b8722532729c62370b6abb278420804d5d071b
Delegated to: Bin Meng
Headers show

Commit Message

Simon Glass Oct. 6, 2016, 2:42 a.m. UTC
Bring in a faster memmove() from Linux 4.7. This speeds up scrolling on the
display.

Signed-off-by: Simon Glass <sjg@chromium.org>
---

Changes in v2:
- Move the code into string.c
- Fix multi-line comments that should be single-line

 arch/x86/include/asm/string.h |   2 +-
 arch/x86/lib/string.c         | 161 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+), 1 deletion(-)

Comments

Bin Meng Oct. 8, 2016, 2:25 a.m. UTC | #1
Hi Simon,

On Thu, Oct 6, 2016 at 10:42 AM, Simon Glass <sjg@chromium.org> wrote:
> Bring in a faster memmove() from Linux 4.7. This speeds up scrolling on the
> display.
>
> Signed-off-by: Simon Glass <sjg@chromium.org>
> ---
>
> Changes in v2:
> - Move the code into string.c
> - Fix multi-line comments that should not be
>
>  arch/x86/include/asm/string.h |   2 +-
>  arch/x86/lib/string.c         | 161 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 162 insertions(+), 1 deletion(-)
>

Reviewed-by: Bin Meng <bmeng.cn@gmail.com>

Tested on Crown Bay with an external PCIe graphics card
Tested-by: Bin Meng <bmeng.cn@gmail.com>

However, I did not see significant speed up on screen scrolling...

Regards,
Bin
Bin Meng Oct. 8, 2016, 4:23 a.m. UTC | #2
On Sat, Oct 8, 2016 at 10:25 AM, Bin Meng <bmeng.cn@gmail.com> wrote:
> Hi Simon,
>
> On Thu, Oct 6, 2016 at 10:42 AM, Simon Glass <sjg@chromium.org> wrote:
>> Bring in a faster memmove() from Linux 4.7. This speeds up scrolling on the
>> display.
>>
>> Signed-off-by: Simon Glass <sjg@chromium.org>
>> ---
>>
>> Changes in v2:
>> - Move the code into string.c
>> - Fix multi-line comments that should not be
>>
>>  arch/x86/include/asm/string.h |   2 +-
>>  arch/x86/lib/string.c         | 161 ++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 162 insertions(+), 1 deletion(-)
>>
>
> Reviewed-by: Bin Meng <bmeng.cn@gmail.com>
>
> Tested on Crown Bay with an external PCIe graphics card
> Tested-by: Bin Meng <bmeng.cn@gmail.com>
>
> However, I did not see significant speed up on screen scrolling...
>

applied to u-boot-x86, thanks!
Bin Meng Oct. 8, 2016, 5:53 a.m. UTC | #3
Hi Simon,

On Sat, Oct 8, 2016 at 10:25 AM, Bin Meng <bmeng.cn@gmail.com> wrote:
> Hi Simon,
>
> On Thu, Oct 6, 2016 at 10:42 AM, Simon Glass <sjg@chromium.org> wrote:
>> Bring in a faster memmove() from Linux 4.7. This speeds up scrolling on the
>> display.
>>
>> Signed-off-by: Simon Glass <sjg@chromium.org>
>> ---
>>
>> Changes in v2:
>> - Move the code into string.c
>> - Fix multi-line comments that should not be
>>
>>  arch/x86/include/asm/string.h |   2 +-
>>  arch/x86/lib/string.c         | 161 ++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 162 insertions(+), 1 deletion(-)
>>
>
> Reviewed-by: Bin Meng <bmeng.cn@gmail.com>
>
> Tested on Crown Bay with an external PCIe graphics card
> Tested-by: Bin Meng <bmeng.cn@gmail.com>
>
> However, I did not see significant speed up on screen scrolling...
>

Crown Bay is still using cfb_console.c, which is the legacy driver
that does not use memmove(). It looks like the new console_xxx driver
is using memmove(). I guess I will need to convert Crown Bay to use DM video.

Regards,
Bin
Bin Meng Oct. 10, 2016, 2:05 a.m. UTC | #4
On Sat, Oct 8, 2016 at 1:53 PM, Bin Meng <bmeng.cn@gmail.com> wrote:
> Hi Simon,
>
> On Sat, Oct 8, 2016 at 10:25 AM, Bin Meng <bmeng.cn@gmail.com> wrote:
>> Hi Simon,
>>
>> On Thu, Oct 6, 2016 at 10:42 AM, Simon Glass <sjg@chromium.org> wrote:
>>> Bring in a faster memmove() from Linux 4.7. This speeds up scrolling on the
>>> display.
>>>
>>> Signed-off-by: Simon Glass <sjg@chromium.org>
>>> ---
>>>
>>> Changes in v2:
>>> - Move the code into string.c
>>> - Fix multi-line comments that should not be
>>>
>>>  arch/x86/include/asm/string.h |   2 +-
>>>  arch/x86/lib/string.c         | 161 ++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 162 insertions(+), 1 deletion(-)
>>>
>>
>> Reviewed-by: Bin Meng <bmeng.cn@gmail.com>
>>
>> Tested on Crown Bay with an external PCIe graphics card
>> Tested-by: Bin Meng <bmeng.cn@gmail.com>
>>
>> However, I did not see significant speed up on screen scrolling...
>>
>
> Crown Bay is still using cfb_console.c, which is the legacy driver
> that does not use memmove(). Looks the new console_xxx driver is using
> memmove(). I guess I will need convert Crown Bay to use DM video.
>

Testing shows that on Crown Bay, the DM driver with the optimized
memmove() does not improve the performance compared to legacy
cfb_console driver. But on Bayleybay, memmove() indeed helps the
screen scrolling.

Regards,
Bin
Simon Glass Oct. 13, 2016, 12:03 a.m. UTC | #5
Hi Bin,

On 9 October 2016 at 20:05, Bin Meng <bmeng.cn@gmail.com> wrote:
> On Sat, Oct 8, 2016 at 1:53 PM, Bin Meng <bmeng.cn@gmail.com> wrote:
>> Hi Simon,
>>
>> On Sat, Oct 8, 2016 at 10:25 AM, Bin Meng <bmeng.cn@gmail.com> wrote:
>>> Hi Simon,
>>>
>>> On Thu, Oct 6, 2016 at 10:42 AM, Simon Glass <sjg@chromium.org> wrote:
>>>> Bring in a faster memmove() from Linux 4.7. This speeds up scrolling on the
>>>> display.
>>>>
>>>> Signed-off-by: Simon Glass <sjg@chromium.org>
>>>> ---
>>>>
>>>> Changes in v2:
>>>> - Move the code into string.c
>>>> - Fix multi-line comments that should not be
>>>>
>>>>  arch/x86/include/asm/string.h |   2 +-
>>>>  arch/x86/lib/string.c         | 161 ++++++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 162 insertions(+), 1 deletion(-)
>>>>
>>>
>>> Reviewed-by: Bin Meng <bmeng.cn@gmail.com>
>>>
>>> Tested on Crown Bay with an external PCIe graphics card
>>> Tested-by: Bin Meng <bmeng.cn@gmail.com>
>>>
>>> However, I did not see significant speed up on screen scrolling...
>>>
>>
>> Crown Bay is still using cfb_console.c, which is the legacy driver
>> that does not use memmove(). Looks the new console_xxx driver is using
>> memmove(). I guess I will need convert Crown Bay to use DM video.
>>
>
> Testing shows that on Crown Bay, the DM driver with the optimized
> memmove() does not improve the performance compared to legacy
> cfb_console driver. But on Bayleybay, memmove() indeed helps the
> screen scrolling.

That's interesting. What is the difference between those two
platforms? Is it slow on Crown Bay?

Regards,
Simon
diff mbox

Patch

diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
index 0ad612f..38afd23 100644
--- a/arch/x86/include/asm/string.h
+++ b/arch/x86/include/asm/string.h
@@ -17,7 +17,7 @@  extern char * strchr(const char * s, int c);
 #define __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *, const void *, __kernel_size_t);
 
-#undef __HAVE_ARCH_MEMMOVE
+#define __HAVE_ARCH_MEMMOVE
 extern void * memmove(void *, const void *, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMCHR
diff --git a/arch/x86/lib/string.c b/arch/x86/lib/string.c
index 6c66431..5343c2b 100644
--- a/arch/x86/lib/string.c
+++ b/arch/x86/lib/string.c
@@ -130,3 +130,164 @@  void *memcpy(void *dstpp, const void *srcpp, size_t len)
 
 	return dstpp;
 }
+
+/*
+ * memmove() - copy n bytes from src to dest; the regions may overlap
+ *
+ * Written for 32-bit x86 (the pointers are held in int-sized operands).
+ * Copies ascending when src >= dest, descending otherwise. 680 bytes or
+ * more with src/dest equal in their low 8 address bits use "rep movsl",
+ * other sizes of 16 or more a 16-byte mov loop. Returns dest.
+ */
+void *memmove(void *dest, const void *src, size_t n)
+{
+	int d0, d1, d2, d3, d4, d5;
+	char *ret = dest;
+
+	__asm__ __volatile__(
+		/* Fewer than 16 bytes: go straight to the tail code */
+		"cmp $0x10, %0\n\t"
+		"jb	1f\n\t"
+		/* Copy backward when src < dest (unsigned compare) */
+		"cmp %2, %1\n\t"
+		"jb	2f\n\t"
+		/* movs has a high startup latency: below 680 bytes use mov */
+		"cmp  $680, %0\n\t"
+		"jb 3f\n\t"
+		/* movs instruction is only good for aligned case */
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 4f\n\t"
+		/* Pre-bias the count by 16 so the loop ends on borrow */
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+		/* We gobble 16 bytes forward in each loop */
+		"3:\n\t"
+		"sub $0x10, %0\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, 1*4(%2)\n\t"
+		"mov 2*4(%1), %3\n\t"
+		"mov 3*4(%1), %4\n\t"
+		"mov  %3, 2*4(%2)\n\t"
+		"mov  %4, 3*4(%2)\n\t"
+		"lea  0x10(%1), %1\n\t"
+		"lea  0x10(%2), %2\n\t"
+		"jae 3b\n\t"
+		"add $0x10, %0\n\t"
+		"jmp 1f\n\t"
+
+		/* Forward movs: save the last dword, store it after the copy */
+		".p2align 4\n\t"
+		"4:\n\t"
+		"mov -4(%1, %0), %3\n\t"
+		"lea -4(%2, %0), %4\n\t"
+		"shr $2, %0\n\t"
+		"rep movsl\n\t"
+		"mov %3, (%4)\n\t"
+		"jmp 11f\n\t"
+		/* Backward movs: save the first dword, run with DF set */
+		".p2align 4\n\t"
+		"6:\n\t"
+		"mov (%1), %3\n\t"
+		"mov %2, %4\n\t"
+		"lea -4(%1, %0), %1\n\t"
+		"lea -4(%2, %0), %2\n\t"
+		"shr $2, %0\n\t"
+		"std\n\t"
+		"rep movsl\n\t"
+		"mov %3,(%4)\n\t"
+		"cld\n\t"
+		"jmp 11f\n\t"
+
+		/* Start to prepare for backward copy */
+		".p2align 4\n\t"
+		"2:\n\t"
+		"cmp  $680, %0\n\t"
+		"jb 5f\n\t"
+		"mov %1, %3\n\t"
+		"xor %2, %3\n\t"
+		"and $0xff, %3\n\t"
+		"jz 6b\n\t"
+
+		/* Point src and dest just past their ends, pre-bias count */
+		"5:\n\t"
+		"add %0, %1\n\t"
+		"add %0, %2\n\t"
+		"sub $0x10, %0\n\t"
+		/* We gobble 16 bytes backward in each loop */
+		"7:\n\t"
+		"sub $0x10, %0\n\t"
+		"mov -1*4(%1), %3\n\t"
+		"mov -2*4(%1), %4\n\t"
+		"mov  %3, -1*4(%2)\n\t"
+		"mov  %4, -2*4(%2)\n\t"
+		"mov -3*4(%1), %3\n\t"
+		"mov -4*4(%1), %4\n\t"
+		"mov  %3, -3*4(%2)\n\t"
+		"mov  %4, -4*4(%2)\n\t"
+		"lea  -0x10(%1), %1\n\t"
+		"lea  -0x10(%2), %2\n\t"
+		"jae 7b\n\t"
+		/* Undo the bias; move both pointers back to the head */
+		"add $0x10, %0\n\t"
+		"sub %0, %1\n\t"
+		"sub %0, %2\n\t"
+
+		/* Move 8 to 15 bytes: load everything, then store */
+		".p2align 4\n\t"
+		"1:\n\t"
+		"cmp $8, %0\n\t"
+		"jb 8f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov 1*4(%1), %4\n\t"
+		"mov -2*4(%1, %0), %5\n\t"
+		"mov -1*4(%1, %0), %1\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, 1*4(%2)\n\t"
+		"mov  %5, -2*4(%2, %0)\n\t"
+		"mov  %1, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/* Move 4 to 7 bytes */
+		".p2align 4\n\t"
+		"8:\n\t"
+		"cmp $4, %0\n\t"
+		"jb 9f\n\t"
+		"mov 0*4(%1), %3\n\t"
+		"mov -1*4(%1, %0), %4\n\t"
+		"mov  %3, 0*4(%2)\n\t"
+		"mov  %4, -1*4(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/* Move 2 to 3 bytes via %w3/%w4, the 16-bit names of %3/%4 */
+		".p2align 4\n\t"
+		"9:\n\t"
+		"cmp $2, %0\n\t"
+		"jb 10f\n\t"
+		"movw 0*2(%1), %w3\n\t"
+		"movw -1*2(%1, %0), %w4\n\t"
+		"movw %w3, 0*2(%2)\n\t"
+		"movw %w4, -1*2(%2, %0)\n\t"
+		"jmp 11f\n\t"
+
+		/* Move 1 byte; %cl is the low byte of the count in %0 */
+		".p2align 4\n\t"
+		"10:\n\t"
+		"cmp $1, %0\n\t"
+		"jb 11f\n\t"
+		"movb (%1), %%cl\n\t"
+		"movb %%cl, (%2)\n\t"
+		".p2align 4\n\t"
+		"11:"
+		: "=&c" (d0), "=&S" (d1), "=&D" (d2),
+		  "=r" (d3), "=r" (d4), "=r"(d5)
+		: "0" (n),
+		 "1" (src),
+		 "2" (dest)
+		: "memory");
+
+	return ret;
+}