powerpc/vdso64: inline __get_datapage()
diff mbox series

Message ID 20190821092959.16066-1-santosh@fossix.org
State New
Headers show
Series
  • powerpc/vdso64: inline __get_datapage()
Related show

Checks

Context Check Description
snowpatch_ozlabs/checkpatch fail total: 3 errors, 1 warnings, 0 checks, 126 lines checked
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/apply_patch success Successfully applied on branch next (c9633332103e55bc73d80d07ead28b95a22a85a3)

Commit Message

Santosh Sivaraj Aug. 21, 2019, 9:29 a.m. UTC
__get_datapage() is only a few instructions to retrieve the
address of the page where the kernel stores data to the VDSO.

By inlining this function into its users, a bl/blr pair and
a mflr/mtlr pair is avoided, plus a few reg moves.

clock-gettime-monotonic: syscall: 514 nsec/call  396 nsec/call
clock-gettime-monotonic:    libc: 25 nsec/call   24 nsec/call
clock-gettime-monotonic:    vdso: 20 nsec/call   20 nsec/call
clock-getres-monotonic: syscall: 347 nsec/call   372 nsec/call
clock-getres-monotonic:    libc: 19 nsec/call    19 nsec/call
clock-getres-monotonic:    vdso: 10 nsec/call    10 nsec/call
clock-gettime-monotonic-coarse: syscall: 511 nsec/call   396 nsec/call
clock-gettime-monotonic-coarse:    libc: 23 nsec/call    21 nsec/call
clock-gettime-monotonic-coarse:    vdso: 15 nsec/call    13 nsec/call
clock-gettime-realtime: syscall: 526 nsec/call   405 nsec/call
clock-gettime-realtime:    libc: 24 nsec/call    23 nsec/call
clock-gettime-realtime:    vdso: 18 nsec/call    18 nsec/call
clock-getres-realtime: syscall: 342 nsec/call    372 nsec/call
clock-getres-realtime:    libc: 19 nsec/call     19 nsec/call
clock-getres-realtime:    vdso: 10 nsec/call     10 nsec/call
clock-gettime-realtime-coarse: syscall: 515 nsec/call    373 nsec/call
clock-gettime-realtime-coarse:    libc: 23 nsec/call     22 nsec/call
clock-gettime-realtime-coarse:    vdso: 14 nsec/call     13 nsec/call

Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32.

Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
---

except for a couple of calls (1 or 2 nsec reduction), there are no
improvements in the call times. Or is 10 nsec the minimum granularity??

So I don't know if its even worth updating vdso64 except to keep vdso32 and
vdso64 equal.


 arch/powerpc/kernel/vdso64/cacheflush.S   | 10 ++++----
 arch/powerpc/kernel/vdso64/datapage.S     | 29 ++++-------------------
 arch/powerpc/kernel/vdso64/datapage.h     | 10 ++++++++
 arch/powerpc/kernel/vdso64/gettimeofday.S |  8 ++++---
 4 files changed, 24 insertions(+), 33 deletions(-)
 create mode 100644 arch/powerpc/kernel/vdso64/datapage.h

Comments

Christophe Leroy Aug. 21, 2019, 9:46 a.m. UTC | #1
Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit :
> __get_datapage() is only a few instructions to retrieve the
> address of the page where the kernel stores data to the VDSO.
> 
> By inlining this function into its users, a bl/blr pair and
> a mflr/mtlr pair is avoided, plus a few reg moves.
> 
> clock-gettime-monotonic: syscall: 514 nsec/call  396 nsec/call
> clock-gettime-monotonic:    libc: 25 nsec/call   24 nsec/call
> clock-gettime-monotonic:    vdso: 20 nsec/call   20 nsec/call
> clock-getres-monotonic: syscall: 347 nsec/call   372 nsec/call
> clock-getres-monotonic:    libc: 19 nsec/call    19 nsec/call
> clock-getres-monotonic:    vdso: 10 nsec/call    10 nsec/call
> clock-gettime-monotonic-coarse: syscall: 511 nsec/call   396 nsec/call
> clock-gettime-monotonic-coarse:    libc: 23 nsec/call    21 nsec/call
> clock-gettime-monotonic-coarse:    vdso: 15 nsec/call    13 nsec/call
> clock-gettime-realtime: syscall: 526 nsec/call   405 nsec/call
> clock-gettime-realtime:    libc: 24 nsec/call    23 nsec/call
> clock-gettime-realtime:    vdso: 18 nsec/call    18 nsec/call
> clock-getres-realtime: syscall: 342 nsec/call    372 nsec/call
> clock-getres-realtime:    libc: 19 nsec/call     19 nsec/call
> clock-getres-realtime:    vdso: 10 nsec/call     10 nsec/call
> clock-gettime-realtime-coarse: syscall: 515 nsec/call    373 nsec/call
> clock-gettime-realtime-coarse:    libc: 23 nsec/call     22 nsec/call
> clock-gettime-realtime-coarse:    vdso: 14 nsec/call     13 nsec/call

I think you should only put the measurements on vdso calls, and only the 
ones that are impacted by the change. For exemple, getres function 
doesn't use __get_datapage so showing it here is pointless.

gettimeofday should be shown there as it uses __get_datapage()


> 
> Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32.
> 
> Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
> ---
> 
> except for a couple of calls (1 or 2 nsec reduction), there are no
> improvements in the call times. Or is 10 nsec the minimum granularity??

Maybe the ones that show no improvements are the ones that don't use 
__get_datapage() at all ...

> 
> So I don't know if its even worth updating vdso64 except to keep vdso32 and
> vdso64 equal.

2ns on a 15ns call is 13% so it is worth it I think.

Christophe


> 
> 
>   arch/powerpc/kernel/vdso64/cacheflush.S   | 10 ++++----
>   arch/powerpc/kernel/vdso64/datapage.S     | 29 ++++-------------------
>   arch/powerpc/kernel/vdso64/datapage.h     | 10 ++++++++
>   arch/powerpc/kernel/vdso64/gettimeofday.S |  8 ++++---
>   4 files changed, 24 insertions(+), 33 deletions(-)
>   create mode 100644 arch/powerpc/kernel/vdso64/datapage.h
> 
> diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S
> index 3f92561a64c4..30e8b0d29bea 100644
> --- a/arch/powerpc/kernel/vdso64/cacheflush.S
> +++ b/arch/powerpc/kernel/vdso64/cacheflush.S
> @@ -10,6 +10,8 @@
>   #include <asm/vdso.h>
>   #include <asm/asm-offsets.h>
>   
> +#include "datapage.h"
> +
>   	.text
>   
>   /*
> @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>     .cfi_startproc
>   	mflr	r12
>     .cfi_register lr,r12
> -	mr	r11,r3
> -	bl	V_LOCAL_FUNC(__get_datapage)
> +	get_datapage	r11, r0
>   	mtlr	r12
> -	mr	r10,r3
>   
>   	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
>   	addi	r5,r7,-1
> -	andc	r6,r11,r5		/* round low to line bdy */
> +	andc	r6,r3,r5		/* round low to line bdy */
>   	subf	r8,r6,r4		/* compute length */
>   	add	r8,r8,r5		/* ensure we get enough */
>   	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
> @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>   
>   	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
>   	addi	r5,r7,-1
> -	andc	r6,r11,r5		/* round low to line bdy */
> +	andc	r6,r3,r5		/* round low to line bdy */
>   	subf	r8,r6,r4		/* compute length */
>   	add	r8,r8,r5
>   	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
> diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
> index dc84f5ae3802..8712f57c931c 100644
> --- a/arch/powerpc/kernel/vdso64/datapage.S
> +++ b/arch/powerpc/kernel/vdso64/datapage.S
> @@ -11,34 +11,13 @@
>   #include <asm/unistd.h>
>   #include <asm/vdso.h>
>   
> +#include "datapage.h"
> +
>   	.text
>   .global	__kernel_datapage_offset;
>   __kernel_datapage_offset:
>   	.long	0
>   
> -V_FUNCTION_BEGIN(__get_datapage)
> -  .cfi_startproc
> -	/* We don't want that exposed or overridable as we want other objects
> -	 * to be able to bl directly to here
> -	 */
> -	.protected __get_datapage
> -	.hidden __get_datapage
> -
> -	mflr	r0
> -  .cfi_register lr,r0
> -
> -	bcl	20,31,data_page_branch
> -data_page_branch:
> -	mflr	r3
> -	mtlr	r0
> -	addi	r3, r3, __kernel_datapage_offset-data_page_branch
> -	lwz	r0,0(r3)
> -  .cfi_restore lr
> -	add	r3,r0,r3
> -	blr
> -  .cfi_endproc
> -V_FUNCTION_END(__get_datapage)
> -
>   /*
>    * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
>    *
> @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
>   	mflr	r12
>     .cfi_register lr,r12
>   	mr	r4,r3
> -	bl	V_LOCAL_FUNC(__get_datapage)
> +	get_datapage	r3, r0
>   	mtlr	r12
>   	addi	r3,r3,CFG_SYSCALL_MAP64
>   	cmpldi	cr0,r4,0
> @@ -75,7 +54,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
>     .cfi_startproc
>   	mflr	r12
>     .cfi_register lr,r12
> -	bl	V_LOCAL_FUNC(__get_datapage)
> +	get_datapage	r3, r0
>   	ld	r3,CFG_TB_TICKS_PER_SEC(r3)
>   	mtlr	r12
>   	crclr	cr0*4+so
> diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h
> new file mode 100644
> index 000000000000..f2f0da0f65f3
> --- /dev/null
> +++ b/arch/powerpc/kernel/vdso64/datapage.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +
> +.macro get_datapage ptr, tmp
> +	bcl	20,31,888f
> +888:
> +	mflr	\ptr
> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
> +	lwz	\tmp, 0(\ptr)
> +	add	\ptr, \tmp, \ptr
> +.endm
> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
> index 07bfe33fe874..7bcc879392cc 100644
> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> @@ -12,6 +12,8 @@
>   #include <asm/asm-offsets.h>
>   #include <asm/unistd.h>
>   
> +#include "datapage.h"
> +
>   	.text
>   /*
>    * Exact prototype of gettimeofday
> @@ -26,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
>   
>   	mr	r11,r3			/* r11 holds tv */
>   	mr	r10,r4			/* r10 holds tz */
> -	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
> +	get_datapage	r3, r0
>   	cmpldi	r11,0			/* check if tv is NULL */
>   	beq	2f
>   	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
> @@ -71,7 +73,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>   	mflr	r12			/* r12 saves lr */
>     .cfi_register lr,r12
>   	mr	r11,r4			/* r11 saves tp */
> -	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
> +	get_datapage	r3, r0		/* get data page */
>   	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
>   	ori	r7,r7,NSEC_PER_SEC@l
>   	beq	cr5,70f
> @@ -218,7 +220,7 @@ V_FUNCTION_BEGIN(__kernel_time)
>     .cfi_register lr,r12
>   
>   	mr	r11,r3			/* r11 holds t */
> -	bl	V_LOCAL_FUNC(__get_datapage)
> +	get_datapage	r3, r0
>   
>   	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
>   
>
Santosh Sivaraj Aug. 21, 2019, 11:20 a.m. UTC | #2
Christophe Leroy <christophe.leroy@c-s.fr> writes:

> Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit :
>> __get_datapage() is only a few instructions to retrieve the
>> address of the page where the kernel stores data to the VDSO.
>> 
>> By inlining this function into its users, a bl/blr pair and
>> a mflr/mtlr pair is avoided, plus a few reg moves.
>> 
>> clock-gettime-monotonic: syscall: 514 nsec/call  396 nsec/call
>> clock-gettime-monotonic:    libc: 25 nsec/call   24 nsec/call
>> clock-gettime-monotonic:    vdso: 20 nsec/call   20 nsec/call
>> clock-getres-monotonic: syscall: 347 nsec/call   372 nsec/call
>> clock-getres-monotonic:    libc: 19 nsec/call    19 nsec/call
>> clock-getres-monotonic:    vdso: 10 nsec/call    10 nsec/call
>> clock-gettime-monotonic-coarse: syscall: 511 nsec/call   396 nsec/call
>> clock-gettime-monotonic-coarse:    libc: 23 nsec/call    21 nsec/call
>> clock-gettime-monotonic-coarse:    vdso: 15 nsec/call    13 nsec/call
>> clock-gettime-realtime: syscall: 526 nsec/call   405 nsec/call
>> clock-gettime-realtime:    libc: 24 nsec/call    23 nsec/call
>> clock-gettime-realtime:    vdso: 18 nsec/call    18 nsec/call
>> clock-getres-realtime: syscall: 342 nsec/call    372 nsec/call
>> clock-getres-realtime:    libc: 19 nsec/call     19 nsec/call
>> clock-getres-realtime:    vdso: 10 nsec/call     10 nsec/call
>> clock-gettime-realtime-coarse: syscall: 515 nsec/call    373 nsec/call
>> clock-gettime-realtime-coarse:    libc: 23 nsec/call     22 nsec/call
>> clock-gettime-realtime-coarse:    vdso: 14 nsec/call     13 nsec/call
>
> I think you should only put the measurements on vdso calls, and only the 
> ones that are impacted by the change. For exemple, getres function 
> doesn't use __get_datapage so showing it here is pointless.
>
> gettimeofday should be shown there as it uses __get_datapage()
>
>
>> 
>> Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32.
>> 
>> Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
>> ---
>> 
>> except for a couple of calls (1 or 2 nsec reduction), there are no
>> improvements in the call times. Or is 10 nsec the minimum granularity??
>
> Maybe the ones that show no improvements are the ones that don't use 
> __get_datapage() at all ...

Yes makes sense.

>
>> 
>> So I don't know if its even worth updating vdso64 except to keep vdso32 and
>> vdso64 equal.
>
> 2ns on a 15ns call is 13% so it is worth it I think.

true. Since datapage.h is the same for both 32 and 64, may be we should put
it in include/asm.

Thanks,
Santosh
>
> Christophe
>
>
>> 
>> 
>>   arch/powerpc/kernel/vdso64/cacheflush.S   | 10 ++++----
>>   arch/powerpc/kernel/vdso64/datapage.S     | 29 ++++-------------------
>>   arch/powerpc/kernel/vdso64/datapage.h     | 10 ++++++++
>>   arch/powerpc/kernel/vdso64/gettimeofday.S |  8 ++++---
>>   4 files changed, 24 insertions(+), 33 deletions(-)
>>   create mode 100644 arch/powerpc/kernel/vdso64/datapage.h
>> 
>> diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S
>> index 3f92561a64c4..30e8b0d29bea 100644
>> --- a/arch/powerpc/kernel/vdso64/cacheflush.S
>> +++ b/arch/powerpc/kernel/vdso64/cacheflush.S
>> @@ -10,6 +10,8 @@
>>   #include <asm/vdso.h>
>>   #include <asm/asm-offsets.h>
>>   
>> +#include "datapage.h"
>> +
>>   	.text
>>   
>>   /*
>> @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>>     .cfi_startproc
>>   	mflr	r12
>>     .cfi_register lr,r12
>> -	mr	r11,r3
>> -	bl	V_LOCAL_FUNC(__get_datapage)
>> +	get_datapage	r11, r0
>>   	mtlr	r12
>> -	mr	r10,r3
>>   
>>   	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
>>   	addi	r5,r7,-1
>> -	andc	r6,r11,r5		/* round low to line bdy */
>> +	andc	r6,r3,r5		/* round low to line bdy */
>>   	subf	r8,r6,r4		/* compute length */
>>   	add	r8,r8,r5		/* ensure we get enough */
>>   	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
>> @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>>   
>>   	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
>>   	addi	r5,r7,-1
>> -	andc	r6,r11,r5		/* round low to line bdy */
>> +	andc	r6,r3,r5		/* round low to line bdy */
>>   	subf	r8,r6,r4		/* compute length */
>>   	add	r8,r8,r5
>>   	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
>> diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
>> index dc84f5ae3802..8712f57c931c 100644
>> --- a/arch/powerpc/kernel/vdso64/datapage.S
>> +++ b/arch/powerpc/kernel/vdso64/datapage.S
>> @@ -11,34 +11,13 @@
>>   #include <asm/unistd.h>
>>   #include <asm/vdso.h>
>>   
>> +#include "datapage.h"
>> +
>>   	.text
>>   .global	__kernel_datapage_offset;
>>   __kernel_datapage_offset:
>>   	.long	0
>>   
>> -V_FUNCTION_BEGIN(__get_datapage)
>> -  .cfi_startproc
>> -	/* We don't want that exposed or overridable as we want other objects
>> -	 * to be able to bl directly to here
>> -	 */
>> -	.protected __get_datapage
>> -	.hidden __get_datapage
>> -
>> -	mflr	r0
>> -  .cfi_register lr,r0
>> -
>> -	bcl	20,31,data_page_branch
>> -data_page_branch:
>> -	mflr	r3
>> -	mtlr	r0
>> -	addi	r3, r3, __kernel_datapage_offset-data_page_branch
>> -	lwz	r0,0(r3)
>> -  .cfi_restore lr
>> -	add	r3,r0,r3
>> -	blr
>> -  .cfi_endproc
>> -V_FUNCTION_END(__get_datapage)
>> -
>>   /*
>>    * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
>>    *
>> @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
>>   	mflr	r12
>>     .cfi_register lr,r12
>>   	mr	r4,r3
>> -	bl	V_LOCAL_FUNC(__get_datapage)
>> +	get_datapage	r3, r0
>>   	mtlr	r12
>>   	addi	r3,r3,CFG_SYSCALL_MAP64
>>   	cmpldi	cr0,r4,0
>> @@ -75,7 +54,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
>>     .cfi_startproc
>>   	mflr	r12
>>     .cfi_register lr,r12
>> -	bl	V_LOCAL_FUNC(__get_datapage)
>> +	get_datapage	r3, r0
>>   	ld	r3,CFG_TB_TICKS_PER_SEC(r3)
>>   	mtlr	r12
>>   	crclr	cr0*4+so
>> diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h
>> new file mode 100644
>> index 000000000000..f2f0da0f65f3
>> --- /dev/null
>> +++ b/arch/powerpc/kernel/vdso64/datapage.h
>> @@ -0,0 +1,10 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +
>> +.macro get_datapage ptr, tmp
>> +	bcl	20,31,888f
>> +888:
>> +	mflr	\ptr
>> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
>> +	lwz	\tmp, 0(\ptr)
>> +	add	\ptr, \tmp, \ptr
>> +.endm
>> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
>> index 07bfe33fe874..7bcc879392cc 100644
>> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
>> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
>> @@ -12,6 +12,8 @@
>>   #include <asm/asm-offsets.h>
>>   #include <asm/unistd.h>
>>   
>> +#include "datapage.h"
>> +
>>   	.text
>>   /*
>>    * Exact prototype of gettimeofday
>> @@ -26,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
>>   
>>   	mr	r11,r3			/* r11 holds tv */
>>   	mr	r10,r4			/* r10 holds tz */
>> -	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
>> +	get_datapage	r3, r0
>>   	cmpldi	r11,0			/* check if tv is NULL */
>>   	beq	2f
>>   	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
>> @@ -71,7 +73,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>>   	mflr	r12			/* r12 saves lr */
>>     .cfi_register lr,r12
>>   	mr	r11,r4			/* r11 saves tp */
>> -	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
>> +	get_datapage	r3, r0		/* get data page */
>>   	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
>>   	ori	r7,r7,NSEC_PER_SEC@l
>>   	beq	cr5,70f
>> @@ -218,7 +220,7 @@ V_FUNCTION_BEGIN(__kernel_time)
>>     .cfi_register lr,r12
>>   
>>   	mr	r11,r3			/* r11 holds t */
>> -	bl	V_LOCAL_FUNC(__get_datapage)
>> +	get_datapage	r3, r0
>>   
>>   	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
>>   
>>
Segher Boessenkool Aug. 21, 2019, 11:44 a.m. UTC | #3
Hi!

On Wed, Aug 21, 2019 at 02:59:59PM +0530, Santosh Sivaraj wrote:
> except for a couple of calls (1 or 2 nsec reduction), there are no
> improvements in the call times. Or is 10 nsec the minimum granularity??
> 
> So I don't know if its even worth updating vdso64 except to keep vdso32 and
> vdso64 equal.

Calls are cheap, in principle...  It is the LR stuff that can make it
slower on some cores, and a lot of calling sequence stuff may have
considerable overhead of course.

> +.macro get_datapage ptr, tmp
> +	bcl	20,31,888f
> +888:
> +	mflr	\ptr
> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
> +	lwz	\tmp, 0(\ptr)
> +	add	\ptr, \tmp, \ptr
> +.endm

(You can just write that as
	bcl 20,31,$+4
	mflr \ptr
etc.  Useless labels are useless :-) )

One thing you might want to do to improve performance is to do this without
the bcl etc., because you cannot really hide the LR latency of that.  But
that isn't very many ns either...  Superscalar helps, OoO helps, but it is
mostly just that >100MHz helps ;-)


Segher
Christophe Leroy Aug. 21, 2019, 11:50 a.m. UTC | #4
Le 21/08/2019 à 13:44, Segher Boessenkool a écrit :
> Hi!
> 
> On Wed, Aug 21, 2019 at 02:59:59PM +0530, Santosh Sivaraj wrote:
>> except for a couple of calls (1 or 2 nsec reduction), there are no
>> improvements in the call times. Or is 10 nsec the minimum granularity??
>>
>> So I don't know if its even worth updating vdso64 except to keep vdso32 and
>> vdso64 equal.
> 
> Calls are cheap, in principle...  It is the LR stuff that can make it
> slower on some cores, and a lot of calling sequence stuff may have
> considerable overhead of course.

On an 8xx, a taken branch is 2 cycles and a non taken branch in 1 cycle 
(+ the refetch if that was not the anticipate branch).

> 
>> +.macro get_datapage ptr, tmp
>> +	bcl	20,31,888f
>> +888:
>> +	mflr	\ptr
>> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
>> +	lwz	\tmp, 0(\ptr)
>> +	add	\ptr, \tmp, \ptr
>> +.endm
> 
> (You can just write that as
> 	bcl 20,31,$+4
> 	mflr \ptr
> etc.  Useless labels are useless :-) )

Nice trick. Will use that.

> 
> One thing you might want to do to improve performance is to do this without
> the bcl etc., because you cannot really hide the LR latency of that.  But
> that isn't very many ns either...  Superscalar helps, OoO helps, but it is
> mostly just that >100MHz helps ;-)

Good idea. Did you have a look at my vdso32 similar patch ? 
https://patchwork.ozlabs.org/patch/1148274/

Do you have any idea on how to avoid that bcl/mflr stuff ?

Christophe
Segher Boessenkool Aug. 21, 2019, 12:15 p.m. UTC | #5
On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote:
> Le 21/08/2019 à 13:44, Segher Boessenkool a écrit :
> >Calls are cheap, in principle...  It is the LR stuff that can make it
> >slower on some cores, and a lot of calling sequence stuff may have
> >considerable overhead of course.
> 
> On an 8xx, a taken branch is 2 cycles and a non taken branch in 1 cycle 
> (+ the refetch if that was not the anticipate branch).

Yup.  And on the big cores they are all 0 cycles, if correctly predicted.
(Taken branches end your fetch group, of course, there are small
inefficiencies everywhere, but that's about the gist of it).

> >>+.macro get_datapage ptr, tmp
> >>+	bcl	20,31,888f
> >>+888:
> >>+	mflr	\ptr
> >>+	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
> >>+	lwz	\tmp, 0(\ptr)
> >>+	add	\ptr, \tmp, \ptr
> >>+.endm
> >
> >(You can just write that as
> >	bcl 20,31,$+4
> >	mflr \ptr
> >etc.  Useless labels are useless :-) )
> 
> Nice trick. Will use that.

Or .+4 if you like that syntax better...  It's all the same thing.

> >One thing you might want to do to improve performance is to do this without
> >the bcl etc., because you cannot really hide the LR latency of that.  But
> >that isn't very many ns either...  Superscalar helps, OoO helps, but it is
> >mostly just that >100MHz helps ;-)
> 
> Good idea. Did you have a look at my vdso32 similar patch ? 
> https://patchwork.ozlabs.org/patch/1148274/

Yes, I saw it.

> Do you have any idea on how to avoid that bcl/mflr stuff ?

Do a load from some fixed address?  Maybe an absolute address, even?
lwz r3,-12344(0)  or similar (that address is in kernel space...)

There aren't many options, and certainly not many *good* options!


Segher
Nathan Lynch Aug. 21, 2019, 3:58 p.m. UTC | #6
Christophe Leroy <christophe.leroy@c-s.fr> writes:
> Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit :
>> __get_datapage() is only a few instructions to retrieve the
>> address of the page where the kernel stores data to the VDSO.
>> 
>> By inlining this function into its users, a bl/blr pair and
>> a mflr/mtlr pair is avoided, plus a few reg moves.
>> 
>> clock-gettime-monotonic: syscall: 514 nsec/call  396 nsec/call
>> clock-gettime-monotonic:    libc: 25 nsec/call   24 nsec/call
>> clock-gettime-monotonic:    vdso: 20 nsec/call   20 nsec/call
>> clock-getres-monotonic: syscall: 347 nsec/call   372 nsec/call
>> clock-getres-monotonic:    libc: 19 nsec/call    19 nsec/call
>> clock-getres-monotonic:    vdso: 10 nsec/call    10 nsec/call
>> clock-gettime-monotonic-coarse: syscall: 511 nsec/call   396 nsec/call
>> clock-gettime-monotonic-coarse:    libc: 23 nsec/call    21 nsec/call
>> clock-gettime-monotonic-coarse:    vdso: 15 nsec/call    13 nsec/call
>> clock-gettime-realtime: syscall: 526 nsec/call   405 nsec/call
>> clock-gettime-realtime:    libc: 24 nsec/call    23 nsec/call
>> clock-gettime-realtime:    vdso: 18 nsec/call    18 nsec/call
>> clock-getres-realtime: syscall: 342 nsec/call    372 nsec/call
>> clock-getres-realtime:    libc: 19 nsec/call     19 nsec/call
>> clock-getres-realtime:    vdso: 10 nsec/call     10 nsec/call
>> clock-gettime-realtime-coarse: syscall: 515 nsec/call    373 nsec/call
>> clock-gettime-realtime-coarse:    libc: 23 nsec/call     22 nsec/call
>> clock-gettime-realtime-coarse:    vdso: 14 nsec/call     13 nsec/call
>
> I think you should only put the measurements on vdso calls, and only the 
> ones that are impacted by the change. For exemple, getres function 
> doesn't use __get_datapage so showing it here is pointless.

I agree with this point, but also, I would caution against using
vdsotest's benchmark function for anything like rigorous performance
analysis. The intention was to roughly confirm the VDSO's relative
performance vs the in-kernel implementations. Not to compare one VDSO
implementation of (say) clock_gettime to another.

I suggest using perf to confirm the expected effects of the change, if
possible.
Christophe Leroy Aug. 21, 2019, 4:23 p.m. UTC | #7
Le 21/08/2019 à 14:15, Segher Boessenkool a écrit :
> On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote:
>> Do you have any idea on how to avoid that bcl/mflr stuff ?
> 
> Do a load from some fixed address?  Maybe an absolute address, even?
> lwz r3,-12344(0)  or similar (that address is in kernel space...)
> 
> There aren't many options, and certainly not many *good* options!
> 

IIUC, the VDSO is seen by apps the same way as a dynamic lib. Couldn't 
the relocation be done only once when the app loads the VDSO as for a 
regular .so lib ?

It looks like it is what others do, at least x86 and arm64, unless I 
misunderstood their code.

Christophe
Santosh Sivaraj Aug. 22, 2019, 4:18 p.m. UTC | #8
Christophe Leroy <christophe.leroy@c-s.fr> writes:

> Le 21/08/2019 à 14:15, Segher Boessenkool a écrit :
>> On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote:
>>> Do you have any idea on how to avoid that bcl/mflr stuff ?
>> 
>> Do a load from some fixed address?  Maybe an absolute address, even?
>> lwz r3,-12344(0)  or similar (that address is in kernel space...)
>> 
>> There aren't many options, and certainly not many *good* options!
>> 
>
> IIUC, the VDSO is seen by apps the same way as a dynamic lib. Couldn't 
> the relocation be done only once when the app loads the VDSO as for a 
> regular .so lib ?

How does address space randomization work for .so libs?

>
> It looks like it is what others do, at least x86 and arm64, unless I 
> misunderstood their code.
>
> Christophe

Patch
diff mbox series

diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S
index 3f92561a64c4..30e8b0d29bea 100644
--- a/arch/powerpc/kernel/vdso64/cacheflush.S
+++ b/arch/powerpc/kernel/vdso64/cacheflush.S
@@ -10,6 +10,8 @@ 
 #include <asm/vdso.h>
 #include <asm/asm-offsets.h>
 
+#include "datapage.h"
+
 	.text
 
 /*
@@ -24,14 +26,12 @@  V_FUNCTION_BEGIN(__kernel_sync_dicache)
   .cfi_startproc
 	mflr	r12
   .cfi_register lr,r12
-	mr	r11,r3
-	bl	V_LOCAL_FUNC(__get_datapage)
+	get_datapage	r11, r0
 	mtlr	r12
-	mr	r10,r3
 
 	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
 	addi	r5,r7,-1
-	andc	r6,r11,r5		/* round low to line bdy */
+	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
 	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
@@ -48,7 +48,7 @@  V_FUNCTION_BEGIN(__kernel_sync_dicache)
 
 	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
 	addi	r5,r7,-1
-	andc	r6,r11,r5		/* round low to line bdy */
+	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5
 	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
index dc84f5ae3802..8712f57c931c 100644
--- a/arch/powerpc/kernel/vdso64/datapage.S
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -11,34 +11,13 @@ 
 #include <asm/unistd.h>
 #include <asm/vdso.h>
 
+#include "datapage.h"
+
 	.text
 .global	__kernel_datapage_offset;
 __kernel_datapage_offset:
 	.long	0
 
-V_FUNCTION_BEGIN(__get_datapage)
-  .cfi_startproc
-	/* We don't want that exposed or overridable as we want other objects
-	 * to be able to bl directly to here
-	 */
-	.protected __get_datapage
-	.hidden __get_datapage
-
-	mflr	r0
-  .cfi_register lr,r0
-
-	bcl	20,31,data_page_branch
-data_page_branch:
-	mflr	r3
-	mtlr	r0
-	addi	r3, r3, __kernel_datapage_offset-data_page_branch
-	lwz	r0,0(r3)
-  .cfi_restore lr
-	add	r3,r0,r3
-	blr
-  .cfi_endproc
-V_FUNCTION_END(__get_datapage)
-
 /*
  * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
  *
@@ -53,7 +32,7 @@  V_FUNCTION_BEGIN(__kernel_get_syscall_map)
 	mflr	r12
   .cfi_register lr,r12
 	mr	r4,r3
-	bl	V_LOCAL_FUNC(__get_datapage)
+	get_datapage	r3, r0
 	mtlr	r12
 	addi	r3,r3,CFG_SYSCALL_MAP64
 	cmpldi	cr0,r4,0
@@ -75,7 +54,7 @@  V_FUNCTION_BEGIN(__kernel_get_tbfreq)
   .cfi_startproc
 	mflr	r12
   .cfi_register lr,r12
-	bl	V_LOCAL_FUNC(__get_datapage)
+	get_datapage	r3, r0
 	ld	r3,CFG_TB_TICKS_PER_SEC(r3)
 	mtlr	r12
 	crclr	cr0*4+so
diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h
new file mode 100644
index 000000000000..f2f0da0f65f3
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/datapage.h
@@ -0,0 +1,10 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+.macro get_datapage ptr, tmp
+	bcl	20,31,888f
+888:
+	mflr	\ptr
+	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
+	lwz	\tmp, 0(\ptr)
+	add	\ptr, \tmp, \ptr
+.endm
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
index 07bfe33fe874..7bcc879392cc 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -12,6 +12,8 @@ 
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
 
+#include "datapage.h"
+
 	.text
 /*
  * Exact prototype of gettimeofday
@@ -26,7 +28,7 @@  V_FUNCTION_BEGIN(__kernel_gettimeofday)
 
 	mr	r11,r3			/* r11 holds tv */
 	mr	r10,r4			/* r10 holds tz */
-	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
+	get_datapage	r3, r0
 	cmpldi	r11,0			/* check if tv is NULL */
 	beq	2f
 	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
@@ -71,7 +73,7 @@  V_FUNCTION_BEGIN(__kernel_clock_gettime)
 	mflr	r12			/* r12 saves lr */
   .cfi_register lr,r12
 	mr	r11,r4			/* r11 saves tp */
-	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
+	get_datapage	r3, r0		/* get data page */
 	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
 	ori	r7,r7,NSEC_PER_SEC@l
 	beq	cr5,70f
@@ -218,7 +220,7 @@  V_FUNCTION_BEGIN(__kernel_time)
   .cfi_register lr,r12
 
 	mr	r11,r3			/* r11 holds t */
-	bl	V_LOCAL_FUNC(__get_datapage)
+	get_datapage	r3, r0
 
 	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)