diff mbox

[U-Boot,V2] arm: Use optimized memcpy and memset from linux

Message ID 1296038757-11800-1-git-send-email-weisserm@arcor.de
State Accepted
Commit d8834a1323af72f6145bc81adadd75185ef6065f
Delegated to: Albert ARIBAUD
Headers show

Commit Message

Matthias Weisser Jan. 26, 2011, 10:45 a.m. UTC
Using optimized versions of memset and memcpy from linux brings a quite
noticeable speed (x2 or better) improvement for these two functions.

Here are some numbers for tests done with jadecpu

                           | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
                           |        | +patch |        | +patch |
---------------------------+--------+--------+--------+--------+
Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
                           |        |        |        |        |
TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
                           |        |        |        |        |
FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
                           |        |        |        |        |
BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
 where CRC is              |  615ms |  615ms |   54ms |   54ms |
 uncompress                | 2460ms | 2462ms |  450ms |  451ms |
 final boot_elf            |  376ms |   68ms |   65ms |   65ms |
                           |        |        |        |        |
BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
 where CRC is              |  600ms |  600ms |  135ms |  135ms |
 uncompress                | 2209ms | 2211ms |  828ms |  828ms |
                           |        |        |        |        |
Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |

(1) No dcache
(2) dcache enabled in board_init
*Does not work when dcache is on

Size impact:

C version:
   text    data     bss     dec     hex filename
 202862   18912  266456  488230   77326 u-boot

ASM version:
   text    data     bss     dec     hex filename
 203798   18912  266288  488998   77626 u-boot
222712  u-boot.bin

Changes since V1:
  - Made the usage of these functions optional via CONFIG_USE_ARCH_MEMCPY / CONFIG_USE_ARCH_MEMSET
  - Usage of PLD instruction on all architectures supporting it
  - Added a README entry
  - Minor style fixes

Signed-off-by: Matthias Weisser <weisserm@arcor.de>
---
 README                           |    6 +
 arch/arm/include/asm/assembler.h |   60 ++++++++++
 arch/arm/include/asm/string.h    |   10 ++-
 arch/arm/lib/Makefile            |    2 +
 arch/arm/lib/memcpy.S            |  241 ++++++++++++++++++++++++++++++++++++++
 arch/arm/lib/memset.S            |  126 ++++++++++++++++++++
 6 files changed, 443 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/include/asm/assembler.h
 create mode 100644 arch/arm/lib/memcpy.S
 create mode 100644 arch/arm/lib/memset.S

Comments

Albert ARIBAUD Jan. 26, 2011, 12:07 p.m. UTC | #1
Hi Matthias,

Le 26/01/2011 11:45, Matthias Weisser a écrit :
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.
>
> Here are some numbers for test done with jadecpu
>
>                             | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
>                             |        | +patch |        | +patch |
> ---------------------------+--------+--------+--------+--------+
> Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
>                             |        |        |        |        |
> TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
>                             |        |        |        |        |
> FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
>                             |        |        |        |        |
> BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
>   where CRC is              |  615ms |  615ms |   54ms |   54ms |
>   uncompress                | 2460ms | 2462ms |  450ms |  451ms |
>   final boot_elf            |  376ms |   68ms |   65ms |   65ms |
>                             |        |        |        |        |
> BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
>   where CRC is              |  600ms |  600ms |  135ms |  135ms |
>   uncompress                | 2209ms | 2211ms |  828ms |  828ms |
>                             |        |        |        |        |
> Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |
>
> (1) No dcache
> (2) dcache enabled in board_init
> *Does not work when dcache is on
>
> Size impact:
>
> C version:
>     text    data     bss     dec     hex filename
>   202862   18912  266456  488230   77326 u-boot
>
> ASM version:
>     text    data     bss     dec     hex filename
>   203798   18912  266288  488998   77626 u-boot
> 222712  u-boot.bin
>
> Changes since V1:
>    - Made the usage of these functions optional be CONFIG_USE_ARCH_MEM
>    - Usage of PLD instruction on all architectures supporting it
>    - Added a README entry
>    - Minor style fixes
>
> Signed-off-by: Matthias Weisser<weisserm@arcor.de>
> ---

IIRC, the '---' line separates patch commit message (above) from 
freeform comments and history (below). Here, at least the version 
history should move below the '---' line.

Also, I think that above the line, /some/ indication of performance 
enhancement and drawbacks should be given, but not a full ASCII table of 
numbers -- that can go below the line.

>   README                           |    6 +
>   arch/arm/include/asm/assembler.h |   60 ++++++++++
>   arch/arm/include/asm/string.h    |   10 ++-
>   arch/arm/lib/Makefile            |    2 +
>   arch/arm/lib/memcpy.S            |  241 ++++++++++++++++++++++++++++++++++++++
>   arch/arm/lib/memset.S            |  126 ++++++++++++++++++++
>   6 files changed, 443 insertions(+), 2 deletions(-)
>   create mode 100644 arch/arm/include/asm/assembler.h
>   create mode 100644 arch/arm/lib/memcpy.S
>   create mode 100644 arch/arm/lib/memset.S
>
> diff --git a/README b/README
> index 755d17c..5c610f2 100644
> --- a/README
> +++ b/README
> @@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options:
>   		that is executed before the actual U-Boot. E.g. when
>   		compiling a NAND SPL.
>
> +- CONFIG_USE_ARCH_MEMCPY
> +  CONFIG_USE_ARCH_MEMSET
> +		If these options are used a optimized version of memcpy/memset will
> +		be used if available. These functions may be faster under some
> +		conditions but may increase the binary size.
> +

The names of the options are not self-explanatory to me. If the difference
is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx
would be a better name?

>   Building the Software:
>   ======================
>
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> new file mode 100644
> index 0000000..418ee94
> --- /dev/null
> +++ b/arch/arm/include/asm/assembler.h
> @@ -0,0 +1,60 @@
> +/*
> + *  arch/arm/include/asm/assembler.h
> + *
> + *  Copyright (C) 1996-2000 Russell King
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + *  This file contains arm architecture specific defines
> + *  for the different processors.
> + *
> + *  Do not include any C declarations in this file - it is included by
> + *  assembler source.
> + */
> +
> +/*
> + * Endian independent macros for shifting bytes within registers.
> + */
> +#ifndef __ARMEB__
> +#define pull		lsr
> +#define push		lsl
> +#define get_byte_0	lsl #0
> +#define get_byte_1	lsr #8
> +#define get_byte_2	lsr #16
> +#define get_byte_3	lsr #24
> +#define put_byte_0	lsl #0
> +#define put_byte_1	lsl #8
> +#define put_byte_2	lsl #16
> +#define put_byte_3	lsl #24
> +#else
> +#define pull		lsl
> +#define push		lsr
> +#define get_byte_0	lsr #24
> +#define get_byte_1	lsr #16
> +#define get_byte_2	lsr #8
> +#define get_byte_3      lsl #0
> +#define put_byte_0	lsl #24
> +#define put_byte_1	lsl #16
> +#define put_byte_2	lsl #8
> +#define put_byte_3      lsl #0
> +#endif
> +
> +/*
> + * Data preload for architectures that support it
> + */
> +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
> +	defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
> +	defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \
> +	defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \
> +	defined(__ARM_ARCH_7R__)
> +#define PLD(code...)	code
> +#else
> +#define PLD(code...)
> +#endif
> +
> +/*
> + * Cache alligned
> + */
> +#define CALGN(code...) code
> diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
> index c3ea582..c6dfb25 100644
> --- a/arch/arm/include/asm/string.h
> +++ b/arch/arm/include/asm/string.h
> @@ -1,6 +1,8 @@
>   #ifndef __ASM_ARM_STRING_H
>   #define __ASM_ARM_STRING_H
>
> +#include<config.h>
> +
>   /*
>    * We don't do inline string functions, since the
>    * optimised inline asm versions are not small.
> @@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c);
>   #undef __HAVE_ARCH_STRCHR
>   extern char * strchr(const char * s, int c);
>
> -#undef __HAVE_ARCH_MEMCPY
> +#ifdef CONFIG_USE_ARCH_MEMCPY
> +#define __HAVE_ARCH_MEMCPY
> +#endif
>   extern void * memcpy(void *, const void *, __kernel_size_t);
>
>   #undef __HAVE_ARCH_MEMMOVE
> @@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t);
>   extern void * memchr(const void *, int, __kernel_size_t);
>
>   #undef __HAVE_ARCH_MEMZERO
> -#undef __HAVE_ARCH_MEMSET
> +#ifdef CONFIG_USE_ARCH_MEMSET
> +#define __HAVE_ARCH_MEMSET
> +#endif
>   extern void * memset(void *, int, __kernel_size_t);
>
>   #if 0
> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> index 454440c..03b1b5e 100644
> --- a/arch/arm/lib/Makefile
> +++ b/arch/arm/lib/Makefile
> @@ -44,6 +44,8 @@ COBJS-y	+= cache-cp15.o
>   endif
>   COBJS-y	+= interrupts.o
>   COBJS-y	+= reset.o
> +SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o
> +SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o
>
>   SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
>   	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
> diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
> new file mode 100644
> index 0000000..40db90e
> --- /dev/null
> +++ b/arch/arm/lib/memcpy.S
> @@ -0,0 +1,241 @@
> +/*
> + *  linux/arch/arm/lib/memcpy.S
> + *
> + *  Author:	Nicolas Pitre
> + *  Created:	Sep 28, 2005
> + *  Copyright:	MontaVista Software, Inc.
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License version 2 as
> + *  published by the Free Software Foundation.
> + */
> +
> +#include<asm/assembler.h>
> +
> +#define W(instr)	instr
> +
> +#define LDR1W_SHIFT	0
> +#define STR1W_SHIFT	0
> +
> +	.macro ldr1w ptr reg abort
> +	W(ldr) \reg, [\ptr], #4
> +	.endm
> +
> +	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
> +	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
> +	.endm
> +
> +	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> +	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> +	.endm
> +
> +	.macro ldr1b ptr reg cond=al abort
> +	ldr\cond\()b \reg, [\ptr], #1
> +	.endm
> +
> +	.macro str1w ptr reg abort
> +	W(str) \reg, [\ptr], #4
> +	.endm
> +
> +	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
> +	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
> +	.endm
> +
> +	.macro str1b ptr reg cond=al abort
> +	str\cond\()b \reg, [\ptr], #1
> +	.endm
> +
> +	.macro enter reg1 reg2
> +	stmdb sp!, {r0, \reg1, \reg2}
> +	.endm
> +
> +	.macro exit reg1 reg2
> +	ldmfd sp!, {r0, \reg1, \reg2}
> +	.endm
> +
> +	.text
> +
> +/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
> +
> +.globl memcpy
> +memcpy:
> +
> +		enter	r4, lr
> +
> +		subs	r2, r2, #4
> +		blt	8f
> +		ands	ip, r0, #3
> +	PLD(	pld	[r1, #0]		)
> +		bne	9f
> +		ands	ip, r1, #3
> +		bne	10f
> +
> +1:		subs	r2, r2, #(28)
> +		stmfd	sp!, {r5 - r8}
> +		blt	5f
> +
> +	CALGN(	ands	ip, r0, #31		)
> +	CALGN(	rsb	r3, ip, #32		)
> +	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
> +	CALGN(	bcs	2f			)
> +	CALGN(	adr	r4, 6f			)
> +	CALGN(	subs	r2, r2, r3		)  @ C gets set
> +	CALGN(	add	pc, r4, ip		)
> +
> +	PLD(	pld	[r1, #0]		)
> +2:	PLD(	subs	r2, r2, #96		)
> +	PLD(	pld	[r1, #28]		)
> +	PLD(	blt	4f			)
> +	PLD(	pld	[r1, #60]		)
> +	PLD(	pld	[r1, #92]		)
> +
> +3:	PLD(	pld	[r1, #124]		)
> +4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		subs	r2, r2, #32
> +		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		bge	3b
> +	PLD(	cmn	r2, #96			)
> +	PLD(	bge	4b			)
> +
> +5:		ands	ip, r2, #28
> +		rsb	ip, ip, #32
> +#if LDR1W_SHIFT>  0
> +		lsl	ip, ip, #LDR1W_SHIFT
> +#endif
> +		addne	pc, pc, ip		@ C is always clear here
> +		b	7f
> +6:
> +		.rept	(1<<  LDR1W_SHIFT)
> +		W(nop)
> +		.endr
> +		ldr1w	r1, r3, abort=20f
> +		ldr1w	r1, r4, abort=20f
> +		ldr1w	r1, r5, abort=20f
> +		ldr1w	r1, r6, abort=20f
> +		ldr1w	r1, r7, abort=20f
> +		ldr1w	r1, r8, abort=20f
> +		ldr1w	r1, lr, abort=20f
> +
> +#if LDR1W_SHIFT<  STR1W_SHIFT
> +		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
> +#elif LDR1W_SHIFT>  STR1W_SHIFT
> +		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
> +#endif
> +		add	pc, pc, ip
> +		nop
> +		.rept	(1<<  STR1W_SHIFT)
> +		W(nop)
> +		.endr
> +		str1w	r0, r3, abort=20f
> +		str1w	r0, r4, abort=20f
> +		str1w	r0, r5, abort=20f
> +		str1w	r0, r6, abort=20f
> +		str1w	r0, r7, abort=20f
> +		str1w	r0, r8, abort=20f
> +		str1w	r0, lr, abort=20f
> +
> +	CALGN(	bcs	2b			)
> +
> +7:		ldmfd	sp!, {r5 - r8}
> +
> +8:		movs	r2, r2, lsl #31
> +		ldr1b	r1, r3, ne, abort=21f
> +		ldr1b	r1, r4, cs, abort=21f
> +		ldr1b	r1, ip, cs, abort=21f
> +		str1b	r0, r3, ne, abort=21f
> +		str1b	r0, r4, cs, abort=21f
> +		str1b	r0, ip, cs, abort=21f
> +
> +		exit	r4, pc
> +
> +9:		rsb	ip, ip, #4
> +		cmp	ip, #2
> +		ldr1b	r1, r3, gt, abort=21f
> +		ldr1b	r1, r4, ge, abort=21f
> +		ldr1b	r1, lr, abort=21f
> +		str1b	r0, r3, gt, abort=21f
> +		str1b	r0, r4, ge, abort=21f
> +		subs	r2, r2, ip
> +		str1b	r0, lr, abort=21f
> +		blt	8b
> +		ands	ip, r1, #3
> +		beq	1b
> +
> +10:		bic	r1, r1, #3
> +		cmp	ip, #2
> +		ldr1w	r1, lr, abort=21f
> +		beq	17f
> +		bgt	18f
> +
> +
> +		.macro	forward_copy_shift pull push
> +
> +		subs	r2, r2, #28
> +		blt	14f
> +
> +	CALGN(	ands	ip, r0, #31		)
> +	CALGN(	rsb	ip, ip, #32		)
> +	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
> +	CALGN(	subcc	r2, r2, ip		)
> +	CALGN(	bcc	15f			)
> +
> +11:		stmfd	sp!, {r5 - r9}
> +
> +	PLD(	pld	[r1, #0]		)
> +	PLD(	subs	r2, r2, #96		)
> +	PLD(	pld	[r1, #28]		)
> +	PLD(	blt	13f			)
> +	PLD(	pld	[r1, #60]		)
> +	PLD(	pld	[r1, #92]		)
> +
> +12:	PLD(	pld	[r1, #124]		)
> +13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
> +		mov	r3, lr, pull #\pull
> +		subs	r2, r2, #32
> +		ldr4w	r1, r8, r9, ip, lr, abort=19f
> +		orr	r3, r3, r4, push #\push
> +		mov	r4, r4, pull #\pull
> +		orr	r4, r4, r5, push #\push
> +		mov	r5, r5, pull #\pull
> +		orr	r5, r5, r6, push #\push
> +		mov	r6, r6, pull #\pull
> +		orr	r6, r6, r7, push #\push
> +		mov	r7, r7, pull #\pull
> +		orr	r7, r7, r8, push #\push
> +		mov	r8, r8, pull #\pull
> +		orr	r8, r8, r9, push #\push
> +		mov	r9, r9, pull #\pull
> +		orr	r9, r9, ip, push #\push
> +		mov	ip, ip, pull #\pull
> +		orr	ip, ip, lr, push #\push
> +		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
> +		bge	12b
> +	PLD(	cmn	r2, #96			)
> +	PLD(	bge	13b			)
> +
> +		ldmfd	sp!, {r5 - r9}
> +
> +14:		ands	ip, r2, #28
> +		beq	16f
> +
> +15:		mov	r3, lr, pull #\pull
> +		ldr1w	r1, lr, abort=21f
> +		subs	ip, ip, #4
> +		orr	r3, r3, lr, push #\push
> +		str1w	r0, r3, abort=21f
> +		bgt	15b
> +	CALGN(	cmp	r2, #0			)
> +	CALGN(	bge	11b			)
> +
> +16:		sub	r1, r1, #(\push / 8)
> +		b	8b
> +
> +		.endm
> +
> +
> +		forward_copy_shift	pull=8	push=24
> +
> +17:		forward_copy_shift	pull=16	push=16
> +
> +18:		forward_copy_shift	pull=24	push=8
> +
> diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
> new file mode 100644
> index 0000000..0cdf895
> --- /dev/null
> +++ b/arch/arm/lib/memset.S
> @@ -0,0 +1,126 @@
> +/*
> + *  linux/arch/arm/lib/memset.S
> + *
> + *  Copyright (C) 1995-2000 Russell King
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + *  ASM optimised string functions
> + */
> +#include<asm/assembler.h>
> +
> +	.text
> +	.align	5
> +	.word	0
> +
> +1:	subs	r2, r2, #4		@ 1 do we have enough
> +	blt	5f			@ 1 bytes to align with?
> +	cmp	r3, #2			@ 1
> +	strltb	r1, [r0], #1		@ 1
> +	strleb	r1, [r0], #1		@ 1
> +	strb	r1, [r0], #1		@ 1
> +	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
> +/*
> + * The pointer is now aligned and the length is adjusted.  Try doing the
> + * memset again.
> + */
> +
> +.globl memset
> +memset:
> +	ands	r3, r0, #3		@ 1 unaligned?
> +	bne	1b			@ 1
> +/*
> + * we know that the pointer in r0 is aligned to a word boundary.
> + */
> +	orr	r1, r1, r1, lsl #8
> +	orr	r1, r1, r1, lsl #16
> +	mov	r3, r1
> +	cmp	r2, #16
> +	blt	4f
> +
> +#if ! CALGN(1)+0
> +
> +/*
> + * We need an extra register for this loop - save the return address and
> + * use the LR
> + */
> +	str	lr, [sp, #-4]!
> +	mov	ip, r1
> +	mov	lr, r1
> +
> +2:	subs	r2, r2, #64
> +	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	stmgeia	r0!, {r1, r3, ip, lr}
> +	bgt	2b
> +	ldmeqfd	sp!, {pc}		@ Now<64 bytes to go.
> +/*
> + * No need to correct the count; we're only testing bits from now on
> + */
> +	tst	r2, #32
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	tst	r2, #16
> +	stmneia	r0!, {r1, r3, ip, lr}
> +	ldr	lr, [sp], #4
> +
> +#else
> +
> +/*
> + * This version aligns the destination pointer in order to write
> + * whole cache lines at once.
> + */
> +
> +	stmfd	sp!, {r4-r7, lr}
> +	mov	r4, r1
> +	mov	r5, r1
> +	mov	r6, r1
> +	mov	r7, r1
> +	mov	ip, r1
> +	mov	lr, r1
> +
> +	cmp	r2, #96
> +	tstgt	r0, #31
> +	ble	3f
> +
> +	and	ip, r0, #31
> +	rsb	ip, ip, #32
> +	sub	r2, r2, ip
> +	movs	ip, ip, lsl #(32 - 4)
> +	stmcsia	r0!, {r4, r5, r6, r7}
> +	stmmiia	r0!, {r4, r5}
> +	tst	ip, #(1<<  30)
> +	mov	ip, r1
> +	strne	r1, [r0], #4
> +
> +3:	subs	r2, r2, #64
> +	stmgeia	r0!, {r1, r3-r7, ip, lr}
> +	stmgeia	r0!, {r1, r3-r7, ip, lr}
> +	bgt	3b
> +	ldmeqfd	sp!, {r4-r7, pc}
> +
> +	tst	r2, #32
> +	stmneia	r0!, {r1, r3-r7, ip, lr}
> +	tst	r2, #16
> +	stmneia	r0!, {r4-r7}
> +	ldmfd	sp!, {r4-r7, lr}
> +
> +#endif
> +
> +4:	tst	r2, #8
> +	stmneia	r0!, {r1, r3}
> +	tst	r2, #4
> +	strne	r1, [r0], #4
> +/*
> + * When we get here, we've got less than 4 bytes to zero.  We
> + * may have an unaligned pointer as well.
> + */
> +5:	tst	r2, #2
> +	strneb	r1, [r0], #1
> +	strneb	r1, [r0], #1
> +	tst	r2, #1
> +	strneb	r1, [r0], #1
> +	mov	pc, lr

Amicalement,
Matthias Weisser Jan. 26, 2011, 12:50 p.m. UTC | #2
Am 26.01.2011 13:07, schrieb Albert ARIBAUD:
>> ---
>
> IIRC, the '---' line separates patch commit message (above) from
> freeform comments and history (below). Here, at least the version
> history should move below the '---' line.

Wolfgang asked me to add the numbers to the commit message. For the
changelog I will investigate how best to do that with git, without
manually editing the patch file before sending it with git send-email.

>> +- CONFIG_USE_ARCH_MEMCPY
>> + CONFIG_USE_ARCH_MEMSET
>> + If these options are used a optimized version of memcpy/memset will
>> + be used if available. These functions may be faster under some
>> + conditions but may increase the binary size.
>> +
>
> The name of the options is not self-explaining to me. If the difference
> is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx
> would be a better name?

Wolfgang didn't object to these names. If we use the OPTIMAL form it is
still not clear what optimal means. There may be a size optimized version
and a speed optimized version. So we would need
CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx which I personally dislike a lot as
it is quite long. I also think that if there is an architecture specific
function, it should be clear that it is optimal in some way.

Thanks for the review

Regards
Matthias
Wolfgang Denk Jan. 26, 2011, 1:07 p.m. UTC | #3
Dear Matthias Weißer,

In message <4D4018AD.7090001@arcor.de> you wrote:
>
> > IIRC, the '---' line separates patch commit message (above) from
> > freeform comments and history (below). Here, at least the version
> > history should move below the '---' line.
> 
> Wolfgang asked me that I add the numbers to the commit message. For the 
> changelog I will investigate the git commands on how to do that best 
> without manually editing the patch file before git send-email them.

Indeed I find that these numbers are information that should go into
the commit message so this data is available to users who have to
decide whether they want to trade the increased speed for the
increased memory footprint.

> >> +- CONFIG_USE_ARCH_MEMCPY
> >> + CONFIG_USE_ARCH_MEMSET
> >> + If these options are used a optimized version of memcpy/memset will
> >> + be used if available. These functions may be faster under some
> >> + conditions but may increase the binary size.
> >> +
> >
> > The name of the options is not self-explaining to me. If the difference
> > is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx
> > would be a better name?
> 
> Wolfgang didn't object on these names. If we use the OPTIMAL form it is 
> still not clear what optimal mean. There may be a size optimized version 
> and a speed optimized version. So we would need 
> CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx which I personally dislike a lot as 
> it is quite long. I also think that if there is an architecture specific 
> function that it should be clear that this is optimal in some way.

Well, "optimal" is not a good idea as I am pretty sure that some
clever person will still be able to spare some cycles here and there,
so his code would be even "more optimal" ;-)

I think the names CONFIG_USE_ARCH_MEMCPY etc. are actually pretty
good, because they are in line with the standard names
__HAVE_ARCH_MEMCPY etc. that are used in a lot of libraries.

Best regards,

Wolfgang Denk
Albert ARIBAUD Jan. 27, 2011, 6:39 p.m. UTC | #4
Hi Wolfgang,

Le 26/01/2011 14:07, Wolfgang Denk a écrit :
> Dear Matthias Weißer,
>
> In message<4D4018AD.7090001@arcor.de>  you wrote:
>>
>>> IIRC, the '---' line separates patch commit message (above) from
>>> freeform comments and history (below). Here, at least the version
>>> history should move below the '---' line.
>>
>> Wolfgang asked me that I add the numbers to the commit message. For the
>> changelog I will investigate the git commands on how to do that best
>> without manually editing the patch file before git send-email them.
>
> Indeed I find that these numbers are information that should go into
> the commit message so this data is available to users who have to
> decide whether they want to trade the increased speed for the
> increased memory footprint.

Can't we have these numbers in a more compact form then? That makes a
really big commit message.

>>>> +- CONFIG_USE_ARCH_MEMCPY
>>>> + CONFIG_USE_ARCH_MEMSET
>>>> + If these options are used a optimized version of memcpy/memset will
>>>> + be used if available. These functions may be faster under some
>>>> + conditions but may increase the binary size.
>>>> +
>>>
>>> The name of the options is not self-explaining to me. If the difference
>>> is "generic vs arch-optimal", then maybe CONFIG_USE_ARCH_OPTIMAL_MEMxxx
>>> would be a better name?
>>
>> Wolfgang didn't object on these names. If we use the OPTIMAL form it is
>> still not clear what optimal mean. There may be a size optimized version
>> and a speed optimized version. So we would need
>> CONFIG_USE_ARCH_SPEED_OPTIMAL_MEMxxx which I personally dislike a lot as
>> it is quite long. I also think that if there is an architecture specific
>> function that it should be clear that this is optimal in some way.
>
> Well, "optimal" is not a good idea as I am pretty sure that some
> clever person will still be able to spare some cycles here and there,
> so his code would be even "more optimal" ;-)

Granted.

> I think the names CONFIG_USE_ARCH_MEMCPY etc. are actually pretty
> good, because they are in line with the standard names
> __HAVE_ARCH_MEMCPY etc. that are used in a lot of libraries.

All right.

> Best regards,
>
> Wolfgang Denk

Amicalement,
Alexander Holler Feb. 20, 2011, 7:35 p.m. UTC | #5
Hello,

Am 26.01.2011 11:45, schrieb Matthias Weisser:
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.

Thanks, tested on armv5 and armv7, therefore

Tested-by: Alexander Holler <holler@ahsoftware.de>

Regards,

Alexander
Albert ARIBAUD March 3, 2011, 7:07 a.m. UTC | #6
Hi Matthias,

Le 26/01/2011 11:45, Matthias Weisser a écrit :
> Using optimized versions of memset and memcpy from linux brings a quite
> noticeable speed (x2 or better) improvement for these two functions.
>
> Here are some numbers for test done with jadecpu
>
>                             | HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
>                             |        | +patch |        | +patch |
> ---------------------------+--------+--------+--------+--------+
> Reset to prompt            |  438ms |  330ms |  228ms |  120ms |
>                             |        |        |        |        |
> TFTP a 3MB img             | 4782ms | 3428ms | 3245ms | 2820ms |
>                             |        |        |        |        |
> FATLOAD USB a 3MB img*     | 8515ms | 8510ms | ------ | ------ |
>                             |        |        |        |        |
> BOOTM LZO img in RAM       | 3473ms | 3168ms |  592ms |  592ms |
>   where CRC is              |  615ms |  615ms |   54ms |   54ms |
>   uncompress                | 2460ms | 2462ms |  450ms |  451ms |
>   final boot_elf            |  376ms |   68ms |   65ms |   65ms |
>                             |        |        |        |        |
> BOOTM LZO img in FLASH     | 3207ms | 2902ms | 1050ms | 1050ms |
>   where CRC is              |  600ms |  600ms |  135ms |  135ms |
>   uncompress                | 2209ms | 2211ms |  828ms |  828ms |
>                             |        |        |        |        |
> Copy 1.4MB from NOR to RAM |  134ms |   72ms |  120ms |   70ms |
>
> (1) No dcache
> (2) dcache enabled in board_init
> *Does not work when dcache is on
>
> Size impact:
>
> C version:
>     text    data     bss     dec     hex filename
>   202862   18912  266456  488230   77326 u-boot
>
> ASM version:
>     text    data     bss     dec     hex filename
>   203798   18912  266288  488998   77626 u-boot
> 222712  u-boot.bin
>
> Changes since V1:
>    - Made the usage of these functions optional be CONFIG_USE_ARCH_MEM
>    - Usage of PLD instruction on all architectures supporting it
>    - Added a README entry
>    - Minor style fixes
>
> Signed-off-by: Matthias Weisser<weisserm@arcor.de>
> ---

Sorry for leaving this patch alone for so long.

Can you please just repost a (rebased) V3 with the history below the cut 
line so that it does not appear in the commit message? As it is not a 
fix but was tested on two different ARM archs, I think I'll add it to 
master rather than next so that it goes into mainline without delay.

Amicalement,
diff mbox

Patch

diff --git a/README b/README
index 755d17c..5c610f2 100644
--- a/README
+++ b/README
@@ -2885,6 +2885,12 @@  Low Level (hardware related) configuration options:
 		that is executed before the actual U-Boot. E.g. when
 		compiling a NAND SPL.
 
+- CONFIG_USE_ARCH_MEMCPY
+  CONFIG_USE_ARCH_MEMSET
+		If these options are used an optimized version of memcpy/memset will
+		be used if available. These functions may be faster under some
+		conditions but may increase the binary size.
+
 Building the Software:
 ======================
 
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
new file mode 100644
index 0000000..418ee94
--- /dev/null
+++ b/arch/arm/include/asm/assembler.h
@@ -0,0 +1,60 @@ 
+/*
+ *  arch/arm/include/asm/assembler.h
+ *
+ *  Copyright (C) 1996-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This file contains arm architecture specific defines
+ *  for the different processors.
+ *
+ *  Do not include any C declarations in this file - it is included by
+ *  assembler source.
+ */
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull		lsr
+#define push		lsl
+#define get_byte_0	lsl #0
+#define get_byte_1	lsr #8
+#define get_byte_2	lsr #16
+#define get_byte_3	lsr #24
+#define put_byte_0	lsl #0
+#define put_byte_1	lsl #8
+#define put_byte_2	lsl #16
+#define put_byte_3	lsl #24
+#else
+#define pull		lsl
+#define push		lsr
+#define get_byte_0	lsr #24
+#define get_byte_1	lsr #16
+#define get_byte_2	lsr #8
+#define get_byte_3      lsl #0
+#define put_byte_0	lsl #24
+#define put_byte_1	lsl #16
+#define put_byte_2	lsl #8
+#define put_byte_3      lsl #0
+#endif
+
+/*
+ * Data preload for architectures that support it
+ */
+#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
+	defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+	defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \
+	defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \
+	defined(__ARM_ARCH_7R__)
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * Cache aligned
+ */
+#define CALGN(code...) code
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index c3ea582..c6dfb25 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -1,6 +1,8 @@ 
 #ifndef __ASM_ARM_STRING_H
 #define __ASM_ARM_STRING_H
 
+#include <config.h>
+
 /*
  * We don't do inline string functions, since the
  * optimised inline asm versions are not small.
@@ -12,7 +14,9 @@  extern char * strrchr(const char * s, int c);
 #undef __HAVE_ARCH_STRCHR
 extern char * strchr(const char * s, int c);
 
-#undef __HAVE_ARCH_MEMCPY
+#ifdef CONFIG_USE_ARCH_MEMCPY
+#define __HAVE_ARCH_MEMCPY
+#endif
 extern void * memcpy(void *, const void *, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMMOVE
@@ -22,7 +26,9 @@  extern void * memmove(void *, const void *, __kernel_size_t);
 extern void * memchr(const void *, int, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMZERO
-#undef __HAVE_ARCH_MEMSET
+#ifdef CONFIG_USE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMSET
+#endif
 extern void * memset(void *, int, __kernel_size_t);
 
 #if 0
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 454440c..03b1b5e 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -44,6 +44,8 @@  COBJS-y	+= cache-cp15.o
 endif
 COBJS-y	+= interrupts.o
 COBJS-y	+= reset.o
+SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o
+SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o
 
 SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
 	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
new file mode 100644
index 0000000..40db90e
--- /dev/null
+++ b/arch/arm/lib/memcpy.S
@@ -0,0 +1,241 @@ 
+/*
+ *  linux/arch/arm/lib/memcpy.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <asm/assembler.h>
+
+#define W(instr)	instr	/* emits the mnemonic unchanged */
+
+#define LDR1W_SHIFT	0	/* shift applied to jump offsets into the ldr1w run */
+#define STR1W_SHIFT	0	/* shift applied to jump offsets into the str1w run */
+/* Load one word from [ptr] into reg, then ptr += 4.  abort is unused. */
+	.macro ldr1w ptr reg abort
+	W(ldr) \reg, [\ptr], #4
+	.endm
+/* Load four words from [ptr] with writeback.  abort is unused. */
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+	.endm
+/* Load eight words from [ptr] with writeback.  abort is unused. */
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+/* Load one byte from [ptr] under condition cond, then ptr += 1. */
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1
+	.endm
+/* Store reg as one word to [ptr], then ptr += 4.  abort is unused. */
+	.macro str1w ptr reg abort
+	W(str) \reg, [\ptr], #4
+	.endm
+/* Store eight words to [ptr] with writeback.  abort is unused. */
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+/* Store the low byte of reg to [ptr] under condition cond, then ptr += 1. */
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1
+	.endm
+/* Push r0 and two more registers; memcpy uses this as its prologue. */
+	.macro enter reg1 reg2
+	stmdb sp!, {r0, \reg1, \reg2}
+	.endm
+/* Pop r0 and two registers; naming pc as reg2 also returns. */
+	.macro exit reg1 reg2
+	ldmfd sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.text
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n);
+ * In: r0 = dest, r1 = src, r2 = n.  Out: r0 = dest, reloaded by "exit". */
+.globl memcpy
+memcpy:
+		/* Save dest (the return value), r4 and lr. */
+		enter	r4, lr
+
+		subs	r2, r2, #4		@ r2 = n - 4
+		blt	8f			@ fewer than 4 bytes: byte tail
+		ands	ip, r0, #3		@ ip = dest & 3
+	PLD(	pld	[r1, #0]		)
+		bne	9f			@ dest is not word aligned
+		ands	ip, r1, #3		@ ip = src & 3
+		bne	10f			@ only src is misaligned
+
+1:		subs	r2, r2, #(28)		@ r2 = bytes left - 32
+		stmfd	sp!, {r5 - r8}
+		blt	5f			@ fewer than 32 bytes left
+
+	CALGN(	ands	ip, r0, #31		)  @ ip = dest & 31
+	CALGN(	rsb	r3, ip, #32		)  @ r3 = 32 - ip
+	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)  @ skip the alignment step
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, r3		)  @ C gets set
+	CALGN(	add	pc, r4, ip		)  @ computed jump into the loads at 6
+		/* Block loop: 32 bytes (eight registers) per pass. */
+	PLD(	pld	[r1, #0]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+3:	PLD(	pld	[r1, #124]		)
+4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #32
+		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		bge	3b			@ next 32-byte block
+	PLD(	cmn	r2, #96			)  @ compare r2 + 96 with 0
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28		@ ip = 4 * whole words left (0-28)
+		rsb	ip, ip, #32		@ ip = 32 - ip, offset past label 6
+#if LDR1W_SHIFT > 0
+		lsl	ip, ip, #LDR1W_SHIFT
+#endif
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f			@ no whole words left
+6:
+		.rept	(1 << LDR1W_SHIFT)
+		W(nop)
+		.endr
+		ldr1w	r1, r3, abort=20f
+		ldr1w	r1, r4, abort=20f
+		ldr1w	r1, r5, abort=20f
+		ldr1w	r1, r6, abort=20f
+		ldr1w	r1, r7, abort=20f
+		ldr1w	r1, r8, abort=20f
+		ldr1w	r1, lr, abort=20f
+
+#if LDR1W_SHIFT < STR1W_SHIFT
+		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
+#elif LDR1W_SHIFT > STR1W_SHIFT
+		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
+#endif
+		add	pc, pc, ip		@ same offset into the stores below
+		nop
+		.rept	(1 << STR1W_SHIFT)
+		W(nop)
+		.endr
+		str1w	r0, r3, abort=20f
+		str1w	r0, r4, abort=20f
+		str1w	r0, r5, abort=20f
+		str1w	r0, r6, abort=20f
+		str1w	r0, r7, abort=20f
+		str1w	r0, r8, abort=20f
+		str1w	r0, lr, abort=20f
+
+	CALGN(	bcs	2b			)  @ C set: came from the alignment step
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31		@ ne: bit 0 set (1 byte), cs: bit 1 set (2 bytes)
+		ldr1b	r1, r3, ne, abort=21f
+		ldr1b	r1, r4, cs, abort=21f
+		ldr1b	r1, ip, cs, abort=21f
+		str1b	r0, r3, ne, abort=21f
+		str1b	r0, r4, cs, abort=21f
+		str1b	r0, ip, cs, abort=21f
+
+		exit	r4, pc
+
+9:		rsb	ip, ip, #4		@ ip = bytes needed to word-align dest
+		cmp	ip, #2
+		ldr1b	r1, r3, gt, abort=21f
+		ldr1b	r1, r4, ge, abort=21f
+		ldr1b	r1, lr, abort=21f
+		str1b	r0, r3, gt, abort=21f
+		str1b	r0, r4, ge, abort=21f
+		subs	r2, r2, ip		@ account for the aligning bytes
+		str1b	r0, lr, abort=21f
+		blt	8b			@ fewer than 4 bytes left
+		ands	ip, r1, #3		@ ip = src & 3
+		beq	1b			@ src is word aligned as well
+
+10:		bic	r1, r1, #3		@ round src down to a word boundary
+		cmp	ip, #2			@ ip = src & 3 (1, 2 or 3)
+		ldr1w	r1, lr, abort=21f
+		beq	17f			@ src & 3 == 2
+		bgt	18f			@ src & 3 == 3
+		/* src & 3 == 1 falls through into the first expansion below. */
+		/* Shifted copy: every stored word is merged from two loaded words. */
+		.macro	forward_copy_shift pull push
+
+		subs	r2, r2, #28
+		blt	14f			@ fewer than 32 bytes left
+
+	CALGN(	ands	ip, r0, #31		)  @ ip = dest & 31
+	CALGN(	rsb	ip, ip, #32		)  @ ip = 32 - ip
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)  @ align dest with the word loop at 15
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #0]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+12:	PLD(	pld	[r1, #124]		)
+13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
+		mov	r3, lr, pull #\pull
+		subs	r2, r2, #32
+		ldr4w	r1, r8, r9, ip, lr, abort=19f
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+		bge	12b			@ next 32-byte block
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28		@ ip = 4 * whole words left
+		beq	16f
+
+15:		mov	r3, lr, pull #\pull
+		ldr1w	r1, lr, abort=21f
+		subs	ip, ip, #4		@ one word per iteration
+		orr	r3, r3, lr, push #\push
+		str1w	r0, r3, abort=21f
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		sub	r1, r1, #(\push / 8)
+		b	8b			@ finish with the byte tail
+
+		.endm
+
+		/* One expansion per source misalignment: src & 3 == 1, 2 and 3. */
+		forward_copy_shift	pull=8	push=24
+
+17:		forward_copy_shift	pull=16	push=16
+
+18:		forward_copy_shift	pull=24	push=8
+
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
new file mode 100644
index 0000000..0cdf895
--- /dev/null
+++ b/arch/arm/lib/memset.S
@@ -0,0 +1,135 @@ 
+/*
+ *  linux/arch/arm/lib/memset.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <asm/assembler.h>
+
+	.text
+	.align	5
+
+/*
+ * Prototype: void *memset(void *s, int c, size_t n);
+ *
+ * In:       r0 = s, r1 = c, r2 = n
+ * Out:      r0 = s, returned unmodified (ip is the running store pointer)
+ * Clobbers: r1, r2, r3, ip and the flags; r4-r8 and lr are saved and
+ *           restored by whichever variant below uses them
+ */
+.globl memset
+memset:
+	and	r1, r1, #255		@ c is converted to unsigned char
+	ands	r3, r0, #3		@ 1 unaligned?
+	mov	ip, r0			@ preserve r0 as return value
+	bne	6f			@ 1
+/*
+ * we know that the pointer in ip is aligned to a word boundary.
+ */
+1:	orr	r1, r1, r1, lsl #8	@ replicate the fill byte into
+	orr	r1, r1, r1, lsl #16	@ all four bytes of r1
+	mov	r3, r1
+	cmp	r2, #16
+	blt	4f			@ short fill: at most 15 bytes left
+
+#if ! CALGN(1)+0
+
+/*
+ * We need 2 extra registers for this loop - use r8 and the LR
+ */
+	stmfd	sp!, {r8, lr}
+	mov	r8, r1
+	mov	lr, r1
+
+2:	subs	r2, r2, #64
+	stmgeia	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
+	stmgeia	ip!, {r1, r3, r8, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}
+	bgt	2b
+	ldmeqfd	sp!, {r8, pc}		@ Now <64 bytes to go.
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+	tst	r2, #32
+	stmneia	ip!, {r1, r3, r8, lr}
+	stmneia	ip!, {r1, r3, r8, lr}
+	tst	r2, #16
+	stmneia	ip!, {r1, r3, r8, lr}
+	ldmfd	sp!, {r8, lr}
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r8, lr}
+	mov	r4, r1
+	mov	r5, r1
+	mov	r6, r1
+	mov	r7, r1
+	mov	r8, r1
+	mov	lr, r1
+
+	cmp	r2, #96
+	tstgt	ip, #31
+	ble	3f			@ short fill or already 32-byte aligned
+
+	and	r8, ip, #31
+	rsb	r8, r8, #32		@ r8 = bytes up to the next 32-byte boundary
+	sub	r2, r2, r8
+	movs	r8, r8, lsl #(32 - 4)	@ C = bit 4 (16 bytes), N = bit 3 (8 bytes)
+	stmcsia	ip!, {r4, r5, r6, r7}
+	stmmiia	ip!, {r4, r5}
+	tst	r8, #(1 << 30)		@ original bit 2 (4 bytes)
+	mov	r8, r1			@ r8 holds the fill pattern again
+	strne	r1, [ip], #4
+
+3:	subs	r2, r2, #64
+	stmgeia	ip!, {r1, r3-r8, lr}
+	stmgeia	ip!, {r1, r3-r8, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r8, pc}
+
+	tst	r2, #32
+	stmneia	ip!, {r1, r3-r8, lr}
+	tst	r2, #16
+	stmneia	ip!, {r4-r7}
+	ldmfd	sp!, {r4-r8, lr}
+
+#endif
+
+4:	tst	r2, #8
+	stmneia	ip!, {r1, r3}
+	tst	r2, #4
+	strne	r1, [ip], #4
+/*
+ * When we get here, we've got less than 4 bytes to zero.  We
+ * may have an unaligned pointer as well.
+ */
+5:	tst	r2, #2
+	strneb	r1, [ip], #1
+	strneb	r1, [ip], #1
+	tst	r2, #1
+	strneb	r1, [ip], #1
+	mov	pc, lr
+
+/*
+ * s is not word aligned: store 4 - r3 bytes (r3 = s & 3) to align ip,
+ * then redo the word-aligned fill.  With fewer than 4 bytes to set,
+ * skip straight to the byte tail.
+ */
+6:	subs	r2, r2, #4		@ 1 do we have enough
+	blt	5b			@ 1 bytes to align with?
+	cmp	r3, #2			@ 1
+	strltb	r1, [ip], #1		@ 1
+	strleb	r1, [ip], #1		@ 1
+	strb	r1, [ip], #1		@ 1
+	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
+	b	1b