Message ID | 20190821092959.16066-1-santosh@fossix.org (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | powerpc/vdso64: inline __get_datapage() | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/apply_patch | success | Successfully applied on branch next (c9633332103e55bc73d80d07ead28b95a22a85a3) |
snowpatch_ozlabs/build-ppc64le | success | Build succeeded |
snowpatch_ozlabs/build-ppc64be | success | Build succeeded |
snowpatch_ozlabs/build-ppc64e | success | Build succeeded |
snowpatch_ozlabs/build-pmac32 | success | Build succeeded |
snowpatch_ozlabs/checkpatch | fail | total: 3 errors, 1 warnings, 0 checks, 126 lines checked |
Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit : > __get_datapage() is only a few instructions to retrieve the > address of the page where the kernel stores data to the VDSO. > > By inlining this function into its users, a bl/blr pair and > a mflr/mtlr pair is avoided, plus a few reg moves. > > clock-gettime-monotonic: syscall: 514 nsec/call 396 nsec/call > clock-gettime-monotonic: libc: 25 nsec/call 24 nsec/call > clock-gettime-monotonic: vdso: 20 nsec/call 20 nsec/call > clock-getres-monotonic: syscall: 347 nsec/call 372 nsec/call > clock-getres-monotonic: libc: 19 nsec/call 19 nsec/call > clock-getres-monotonic: vdso: 10 nsec/call 10 nsec/call > clock-gettime-monotonic-coarse: syscall: 511 nsec/call 396 nsec/call > clock-gettime-monotonic-coarse: libc: 23 nsec/call 21 nsec/call > clock-gettime-monotonic-coarse: vdso: 15 nsec/call 13 nsec/call > clock-gettime-realtime: syscall: 526 nsec/call 405 nsec/call > clock-gettime-realtime: libc: 24 nsec/call 23 nsec/call > clock-gettime-realtime: vdso: 18 nsec/call 18 nsec/call > clock-getres-realtime: syscall: 342 nsec/call 372 nsec/call > clock-getres-realtime: libc: 19 nsec/call 19 nsec/call > clock-getres-realtime: vdso: 10 nsec/call 10 nsec/call > clock-gettime-realtime-coarse: syscall: 515 nsec/call 373 nsec/call > clock-gettime-realtime-coarse: libc: 23 nsec/call 22 nsec/call > clock-gettime-realtime-coarse: vdso: 14 nsec/call 13 nsec/call I think you should only put the measurements on vdso calls, and only the ones that are impacted by the change. For exemple, getres function doesn't use __get_datapage so showing it here is pointless. gettimeofday should be shown there as it uses __get_datapage() > > Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32. > > Signed-off-by: Santosh Sivaraj <santosh@fossix.org> > --- > > except for a couple of calls (1 or 2 nsec reduction), there are no > improvements in the call times. Or is 10 nsec the minimum granularity?? Maybe the ones that show no improvements are the ones that don't use __get_datapage() at all ... > > So I don't know if its even worth updating vdso64 except to keep vdso32 and > vdso64 equal. 2ns on a 15ns call is 13% so it is worth it I think. Christophe > > > arch/powerpc/kernel/vdso64/cacheflush.S | 10 ++++---- > arch/powerpc/kernel/vdso64/datapage.S | 29 ++++------------------- > arch/powerpc/kernel/vdso64/datapage.h | 10 ++++++++ > arch/powerpc/kernel/vdso64/gettimeofday.S | 8 ++++--- > 4 files changed, 24 insertions(+), 33 deletions(-) > create mode 100644 arch/powerpc/kernel/vdso64/datapage.h > > diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S > index 3f92561a64c4..30e8b0d29bea 100644 > --- a/arch/powerpc/kernel/vdso64/cacheflush.S > +++ b/arch/powerpc/kernel/vdso64/cacheflush.S > @@ -10,6 +10,8 @@ > #include <asm/vdso.h> > #include <asm/asm-offsets.h> > > +#include "datapage.h" > + > .text > > /* > @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) > .cfi_startproc > mflr r12 > .cfi_register lr,r12 > - mr r11,r3 > - bl V_LOCAL_FUNC(__get_datapage) > + get_datapage r11, r0 > mtlr r12 > - mr r10,r3 > > lwz r7,CFG_DCACHE_BLOCKSZ(r10) > addi r5,r7,-1 > - andc r6,r11,r5 /* round low to line bdy */ > + andc r6,r3,r5 /* round low to line bdy */ > subf r8,r6,r4 /* compute length */ > add r8,r8,r5 /* ensure we get enough */ > lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) > @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) > > lwz r7,CFG_ICACHE_BLOCKSZ(r10) > addi r5,r7,-1 > - andc r6,r11,r5 /* round low to line bdy */ > + andc r6,r3,r5 /* round low to line bdy */ > subf r8,r6,r4 /* compute length */ > add r8,r8,r5 > lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) > diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S > index dc84f5ae3802..8712f57c931c 100644 > --- a/arch/powerpc/kernel/vdso64/datapage.S > +++ b/arch/powerpc/kernel/vdso64/datapage.S > @@ -11,34 +11,13 @@ > #include <asm/unistd.h> > #include <asm/vdso.h> > > +#include "datapage.h" > + > .text > .global __kernel_datapage_offset; > __kernel_datapage_offset: > .long 0 > > -V_FUNCTION_BEGIN(__get_datapage) > - .cfi_startproc > - /* We don't want that exposed or overridable as we want other objects > - * to be able to bl directly to here > - */ > - .protected __get_datapage > - .hidden __get_datapage > - > - mflr r0 > - .cfi_register lr,r0 > - > - bcl 20,31,data_page_branch > -data_page_branch: > - mflr r3 > - mtlr r0 > - addi r3, r3, __kernel_datapage_offset-data_page_branch > - lwz r0,0(r3) > - .cfi_restore lr > - add r3,r0,r3 > - blr > - .cfi_endproc > -V_FUNCTION_END(__get_datapage) > - > /* > * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; > * > @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) > mflr r12 > .cfi_register lr,r12 > mr r4,r3 > - bl V_LOCAL_FUNC(__get_datapage) > + get_datapage r3, r0 > mtlr r12 > addi r3,r3,CFG_SYSCALL_MAP64 > cmpldi cr0,r4,0 > @@ -75,7 +54,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) > .cfi_startproc > mflr r12 > .cfi_register lr,r12 > - bl V_LOCAL_FUNC(__get_datapage) > + get_datapage r3, r0 > ld r3,CFG_TB_TICKS_PER_SEC(r3) > mtlr r12 > crclr cr0*4+so > diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h > new file mode 100644 > index 000000000000..f2f0da0f65f3 > --- /dev/null > +++ b/arch/powerpc/kernel/vdso64/datapage.h > @@ -0,0 +1,10 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > + > +.macro get_datapage ptr, tmp > + bcl 20,31,888f > +888: > + mflr \ptr > + addi \ptr, \ptr, __kernel_datapage_offset - 888b > + lwz \tmp, 0(\ptr) > + add \ptr, \tmp, \ptr > +.endm > diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S > index 07bfe33fe874..7bcc879392cc 100644 > --- a/arch/powerpc/kernel/vdso64/gettimeofday.S > +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S > @@ -12,6 +12,8 @@ > #include <asm/asm-offsets.h> > #include <asm/unistd.h> > > +#include "datapage.h" > + > .text > /* > * Exact prototype of gettimeofday > @@ -26,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) > > mr r11,r3 /* r11 holds tv */ > mr r10,r4 /* r10 holds tz */ > - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ > + get_datapage r3, r0 > cmpldi r11,0 /* check if tv is NULL */ > beq 2f > lis r7,1000000@ha /* load up USEC_PER_SEC */ > @@ -71,7 +73,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) > mflr r12 /* r12 saves lr */ > .cfi_register lr,r12 > mr r11,r4 /* r11 saves tp */ > - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ > + get_datapage r3, r0 /* get data page */ > lis r7,NSEC_PER_SEC@h /* want nanoseconds */ > ori r7,r7,NSEC_PER_SEC@l > beq cr5,70f > @@ -218,7 +220,7 @@ V_FUNCTION_BEGIN(__kernel_time) > .cfi_register lr,r12 > > mr r11,r3 /* r11 holds t */ > - bl V_LOCAL_FUNC(__get_datapage) > + get_datapage r3, r0 > > ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) > >
Christophe Leroy <christophe.leroy@c-s.fr> writes: > Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit : >> __get_datapage() is only a few instructions to retrieve the >> address of the page where the kernel stores data to the VDSO. >> >> By inlining this function into its users, a bl/blr pair and >> a mflr/mtlr pair is avoided, plus a few reg moves. >> >> clock-gettime-monotonic: syscall: 514 nsec/call 396 nsec/call >> clock-gettime-monotonic: libc: 25 nsec/call 24 nsec/call >> clock-gettime-monotonic: vdso: 20 nsec/call 20 nsec/call >> clock-getres-monotonic: syscall: 347 nsec/call 372 nsec/call >> clock-getres-monotonic: libc: 19 nsec/call 19 nsec/call >> clock-getres-monotonic: vdso: 10 nsec/call 10 nsec/call >> clock-gettime-monotonic-coarse: syscall: 511 nsec/call 396 nsec/call >> clock-gettime-monotonic-coarse: libc: 23 nsec/call 21 nsec/call >> clock-gettime-monotonic-coarse: vdso: 15 nsec/call 13 nsec/call >> clock-gettime-realtime: syscall: 526 nsec/call 405 nsec/call >> clock-gettime-realtime: libc: 24 nsec/call 23 nsec/call >> clock-gettime-realtime: vdso: 18 nsec/call 18 nsec/call >> clock-getres-realtime: syscall: 342 nsec/call 372 nsec/call >> clock-getres-realtime: libc: 19 nsec/call 19 nsec/call >> clock-getres-realtime: vdso: 10 nsec/call 10 nsec/call >> clock-gettime-realtime-coarse: syscall: 515 nsec/call 373 nsec/call >> clock-gettime-realtime-coarse: libc: 23 nsec/call 22 nsec/call >> clock-gettime-realtime-coarse: vdso: 14 nsec/call 13 nsec/call > > I think you should only put the measurements on vdso calls, and only the > ones that are impacted by the change. For exemple, getres function > doesn't use __get_datapage so showing it here is pointless. > > gettimeofday should be shown there as it uses __get_datapage() > > >> >> Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32. >> >> Signed-off-by: Santosh Sivaraj <santosh@fossix.org> >> --- >> >> except for a couple of calls (1 or 2 nsec reduction), there are no >> improvements in the call times. Or is 10 nsec the minimum granularity?? > > Maybe the ones that show no improvements are the ones that don't use > __get_datapage() at all ... Yes makes sense. > >> >> So I don't know if its even worth updating vdso64 except to keep vdso32 and >> vdso64 equal. > > 2ns on a 15ns call is 13% so it is worth it I think. true. Since datapage.h is the same for both 32 and 64, may be we should put it in include/asm. Thanks, Santosh > > Christophe > > >> >> >> arch/powerpc/kernel/vdso64/cacheflush.S | 10 ++++---- >> arch/powerpc/kernel/vdso64/datapage.S | 29 ++++------------------- >> arch/powerpc/kernel/vdso64/datapage.h | 10 ++++++++ >> arch/powerpc/kernel/vdso64/gettimeofday.S | 8 ++++--- >> 4 files changed, 24 insertions(+), 33 deletions(-) >> create mode 100644 arch/powerpc/kernel/vdso64/datapage.h >> >> diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S >> index 3f92561a64c4..30e8b0d29bea 100644 >> --- a/arch/powerpc/kernel/vdso64/cacheflush.S >> +++ b/arch/powerpc/kernel/vdso64/cacheflush.S >> @@ -10,6 +10,8 @@ >> #include <asm/vdso.h> >> #include <asm/asm-offsets.h> >> >> +#include "datapage.h" >> + >> .text >> >> /* >> @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) >> .cfi_startproc >> mflr r12 >> .cfi_register lr,r12 >> - mr r11,r3 >> - bl V_LOCAL_FUNC(__get_datapage) >> + get_datapage r11, r0 >> mtlr r12 >> - mr r10,r3 >> >> lwz r7,CFG_DCACHE_BLOCKSZ(r10) >> addi r5,r7,-1 >> - andc r6,r11,r5 /* round low to line bdy */ >> + andc r6,r3,r5 /* round low to line bdy */ >> subf r8,r6,r4 /* compute length */ >> add r8,r8,r5 /* ensure we get enough */ >> lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) >> @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) >> >> lwz r7,CFG_ICACHE_BLOCKSZ(r10) >> addi r5,r7,-1 >> - andc r6,r11,r5 /* round low to line bdy */ >> + andc r6,r3,r5 /* round low to line bdy */ >> subf r8,r6,r4 /* compute length */ >> add r8,r8,r5 >> lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) >> diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S >> index dc84f5ae3802..8712f57c931c 100644 >> --- a/arch/powerpc/kernel/vdso64/datapage.S >> +++ b/arch/powerpc/kernel/vdso64/datapage.S >> @@ -11,34 +11,13 @@ >> #include <asm/unistd.h> >> #include <asm/vdso.h> >> >> +#include "datapage.h" >> + >> .text >> .global __kernel_datapage_offset; >> __kernel_datapage_offset: >> .long 0 >> >> -V_FUNCTION_BEGIN(__get_datapage) >> - .cfi_startproc >> - /* We don't want that exposed or overridable as we want other objects >> - * to be able to bl directly to here >> - */ >> - .protected __get_datapage >> - .hidden __get_datapage >> - >> - mflr r0 >> - .cfi_register lr,r0 >> - >> - bcl 20,31,data_page_branch >> -data_page_branch: >> - mflr r3 >> - mtlr r0 >> - addi r3, r3, __kernel_datapage_offset-data_page_branch >> - lwz r0,0(r3) >> - .cfi_restore lr >> - add r3,r0,r3 >> - blr >> - .cfi_endproc >> -V_FUNCTION_END(__get_datapage) >> - >> /* >> * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; >> * >> @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) >> mflr r12 >> .cfi_register lr,r12 >> mr r4,r3 >> - bl V_LOCAL_FUNC(__get_datapage) >> + get_datapage r3, r0 >> mtlr r12 >> addi r3,r3,CFG_SYSCALL_MAP64 >> cmpldi cr0,r4,0 >> @@ -75,7 +54,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) >> .cfi_startproc >> mflr r12 >> .cfi_register lr,r12 >> - bl V_LOCAL_FUNC(__get_datapage) >> + get_datapage r3, r0 >> ld r3,CFG_TB_TICKS_PER_SEC(r3) >> mtlr r12 >> crclr cr0*4+so >> diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h >> new file mode 100644 >> index 000000000000..f2f0da0f65f3 >> --- /dev/null >> +++ b/arch/powerpc/kernel/vdso64/datapage.h >> @@ -0,0 +1,10 @@ >> +/* SPDX-License-Identifier: GPL-2.0-or-later */ >> + >> +.macro get_datapage ptr, tmp >> + bcl 20,31,888f >> +888: >> + mflr \ptr >> + addi \ptr, \ptr, __kernel_datapage_offset - 888b >> + lwz \tmp, 0(\ptr) >> + add \ptr, \tmp, \ptr >> +.endm >> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S >> index 07bfe33fe874..7bcc879392cc 100644 >> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S >> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S >> @@ -12,6 +12,8 @@ >> #include <asm/asm-offsets.h> >> #include <asm/unistd.h> >> >> +#include "datapage.h" >> + >> .text >> /* >> * Exact prototype of gettimeofday >> @@ -26,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) >> >> mr r11,r3 /* r11 holds tv */ >> mr r10,r4 /* r10 holds tz */ >> - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ >> + get_datapage r3, r0 >> cmpldi r11,0 /* check if tv is NULL */ >> beq 2f >> lis r7,1000000@ha /* load up USEC_PER_SEC */ >> @@ -71,7 +73,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) >> mflr r12 /* r12 saves lr */ >> .cfi_register lr,r12 >> mr r11,r4 /* r11 saves tp */ >> - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ >> + get_datapage r3, r0 /* get data page */ >> lis r7,NSEC_PER_SEC@h /* want nanoseconds */ >> ori r7,r7,NSEC_PER_SEC@l >> beq cr5,70f >> @@ -218,7 +220,7 @@ V_FUNCTION_BEGIN(__kernel_time) >> .cfi_register lr,r12 >> >> mr r11,r3 /* r11 holds t */ >> - bl V_LOCAL_FUNC(__get_datapage) >> + get_datapage r3, r0 >> >> ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) >> >>
Hi! On Wed, Aug 21, 2019 at 02:59:59PM +0530, Santosh Sivaraj wrote: > except for a couple of calls (1 or 2 nsec reduction), there are no > improvements in the call times. Or is 10 nsec the minimum granularity?? > > So I don't know if its even worth updating vdso64 except to keep vdso32 and > vdso64 equal. Calls are cheap, in principle... It is the LR stuff that can make it slower on some cores, and a lot of calling sequence stuff may have considerable overhead of course. > +.macro get_datapage ptr, tmp > + bcl 20,31,888f > +888: > + mflr \ptr > + addi \ptr, \ptr, __kernel_datapage_offset - 888b > + lwz \tmp, 0(\ptr) > + add \ptr, \tmp, \ptr > +.endm (You can just write that as bcl 20,31,$+4 mflr \ptr etc. Useless labels are useless :-) ) One thing you might want to do to improve performance is to do this without the bcl etc., because you cannot really hide the LR latency of that. But that isn't very many ns either... Superscalar helps, OoO helps, but it is mostly just that >100MHz helps ;-) Segher
Le 21/08/2019 à 13:44, Segher Boessenkool a écrit : > Hi! > > On Wed, Aug 21, 2019 at 02:59:59PM +0530, Santosh Sivaraj wrote: >> except for a couple of calls (1 or 2 nsec reduction), there are no >> improvements in the call times. Or is 10 nsec the minimum granularity?? >> >> So I don't know if its even worth updating vdso64 except to keep vdso32 and >> vdso64 equal. > > Calls are cheap, in principle... It is the LR stuff that can make it > slower on some cores, and a lot of calling sequence stuff may have > considerable overhead of course. On an 8xx, a taken branch is 2 cycles and a non taken branch in 1 cycle (+ the refetch if that was not the anticipate branch). > >> +.macro get_datapage ptr, tmp >> + bcl 20,31,888f >> +888: >> + mflr \ptr >> + addi \ptr, \ptr, __kernel_datapage_offset - 888b >> + lwz \tmp, 0(\ptr) >> + add \ptr, \tmp, \ptr >> +.endm > > (You can just write that as > bcl 20,31,$+4 > mflr \ptr > etc. Useless labels are useless :-) ) Nice trick. Will use that. > > One thing you might want to do to improve performance is to do this without > the bcl etc., because you cannot really hide the LR latency of that. But > that isn't very many ns either... Superscalar helps, OoO helps, but it is > mostly just that >100MHz helps ;-) Good idea. Did you have a look at my vdso32 similar patch ? https://patchwork.ozlabs.org/patch/1148274/ Do you have any idea on how to avoid that bcl/mflr stuff ? Christophe
On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote: > Le 21/08/2019 à 13:44, Segher Boessenkool a écrit : > >Calls are cheap, in principle... It is the LR stuff that can make it > >slower on some cores, and a lot of calling sequence stuff may have > >considerable overhead of course. > > On an 8xx, a taken branch is 2 cycles and a non taken branch in 1 cycle > (+ the refetch if that was not the anticipate branch). Yup. And on the big cores they are all 0 cycles, if correctly predicted. (Taken branches end your fetch group, of course, there are small inefficiencies everywhere, but that's about the gist of it). > >>+.macro get_datapage ptr, tmp > >>+ bcl 20,31,888f > >>+888: > >>+ mflr \ptr > >>+ addi \ptr, \ptr, __kernel_datapage_offset - 888b > >>+ lwz \tmp, 0(\ptr) > >>+ add \ptr, \tmp, \ptr > >>+.endm > > > >(You can just write that as > > bcl 20,31,$+4 > > mflr \ptr > >etc. Useless labels are useless :-) ) > > Nice trick. Will use that. Or .+4 if you like that syntax better... It's all the same thing. > >One thing you might want to do to improve performance is to do this without > >the bcl etc., because you cannot really hide the LR latency of that. But > >that isn't very many ns either... Superscalar helps, OoO helps, but it is > >mostly just that >100MHz helps ;-) > > Good idea. Did you have a look at my vdso32 similar patch ? > https://patchwork.ozlabs.org/patch/1148274/ Yes, I saw it. > Do you have any idea on how to avoid that bcl/mflr stuff ? Do a load from some fixed address? Maybe an absolute address, even? lwz r3,-12344(0) or similar (that address is in kernel space...) There aren't many options, and certainly not many *good* options! Segher
Christophe Leroy <christophe.leroy@c-s.fr> writes: > Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit : >> __get_datapage() is only a few instructions to retrieve the >> address of the page where the kernel stores data to the VDSO. >> >> By inlining this function into its users, a bl/blr pair and >> a mflr/mtlr pair is avoided, plus a few reg moves. >> >> clock-gettime-monotonic: syscall: 514 nsec/call 396 nsec/call >> clock-gettime-monotonic: libc: 25 nsec/call 24 nsec/call >> clock-gettime-monotonic: vdso: 20 nsec/call 20 nsec/call >> clock-getres-monotonic: syscall: 347 nsec/call 372 nsec/call >> clock-getres-monotonic: libc: 19 nsec/call 19 nsec/call >> clock-getres-monotonic: vdso: 10 nsec/call 10 nsec/call >> clock-gettime-monotonic-coarse: syscall: 511 nsec/call 396 nsec/call >> clock-gettime-monotonic-coarse: libc: 23 nsec/call 21 nsec/call >> clock-gettime-monotonic-coarse: vdso: 15 nsec/call 13 nsec/call >> clock-gettime-realtime: syscall: 526 nsec/call 405 nsec/call >> clock-gettime-realtime: libc: 24 nsec/call 23 nsec/call >> clock-gettime-realtime: vdso: 18 nsec/call 18 nsec/call >> clock-getres-realtime: syscall: 342 nsec/call 372 nsec/call >> clock-getres-realtime: libc: 19 nsec/call 19 nsec/call >> clock-getres-realtime: vdso: 10 nsec/call 10 nsec/call >> clock-gettime-realtime-coarse: syscall: 515 nsec/call 373 nsec/call >> clock-gettime-realtime-coarse: libc: 23 nsec/call 22 nsec/call >> clock-gettime-realtime-coarse: vdso: 14 nsec/call 13 nsec/call > > I think you should only put the measurements on vdso calls, and only the > ones that are impacted by the change. For exemple, getres function > doesn't use __get_datapage so showing it here is pointless. I agree with this point, but also, I would caution against using vdsotest's benchmark function for anything like rigorous performance analysis. The intention was to roughly confirm the VDSO's relative performance vs the in-kernel implementations. Not to compare one VDSO implementation of (say) clock_gettime to another. I suggest using perf to confirm the expected effects of the change, if possible.
Le 21/08/2019 à 14:15, Segher Boessenkool a écrit : > On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote: >> Do you have any idea on how to avoid that bcl/mflr stuff ? > > Do a load from some fixed address? Maybe an absolute address, even? > lwz r3,-12344(0) or similar (that address is in kernel space...) > > There aren't many options, and certainly not many *good* options! > IIUC, the VDSO is seen by apps the same way as a dynamic lib. Couldn't the relocation be done only once when the app loads the VDSO as for a regular .so lib ? It looks like it is what others do, at least x86 and arm64, unless I misunderstood their code. Christophe
Christophe Leroy <christophe.leroy@c-s.fr> writes: > Le 21/08/2019 à 14:15, Segher Boessenkool a écrit : >> On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote: >>> Do you have any idea on how to avoid that bcl/mflr stuff ? >> >> Do a load from some fixed address? Maybe an absolute address, even? >> lwz r3,-12344(0) or similar (that address is in kernel space...) >> >> There aren't many options, and certainly not many *good* options! >> > > IIUC, the VDSO is seen by apps the same way as a dynamic lib. Couldn't > the relocation be done only once when the app loads the VDSO as for a > regular .so lib ? How does address space randomization work for .so libs? > > It looks like it is what others do, at least x86 and arm64, unless I > misunderstood their code. > > Christophe
diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S index 3f92561a64c4..30e8b0d29bea 100644 --- a/arch/powerpc/kernel/vdso64/cacheflush.S +++ b/arch/powerpc/kernel/vdso64/cacheflush.S @@ -10,6 +10,8 @@ #include <asm/vdso.h> #include <asm/asm-offsets.h> +#include "datapage.h" + .text /* @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) .cfi_startproc mflr r12 .cfi_register lr,r12 - mr r11,r3 - bl V_LOCAL_FUNC(__get_datapage) + get_datapage r11, r0 mtlr r12 - mr r10,r3 lwz r7,CFG_DCACHE_BLOCKSZ(r10) addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ + andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) lwz r7,CFG_ICACHE_BLOCKSZ(r10) addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ + andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S index dc84f5ae3802..8712f57c931c 100644 --- a/arch/powerpc/kernel/vdso64/datapage.S +++ b/arch/powerpc/kernel/vdso64/datapage.S @@ -11,34 +11,13 @@ #include <asm/unistd.h> #include <asm/vdso.h> +#include "datapage.h" + .text .global __kernel_datapage_offset; __kernel_datapage_offset: .long 0 -V_FUNCTION_BEGIN(__get_datapage) - .cfi_startproc - /* We don't want that exposed or overridable as we want other objects - * to be able to bl directly to here - */ - .protected __get_datapage - .hidden __get_datapage - - mflr r0 - .cfi_register lr,r0 - - bcl 20,31,data_page_branch -data_page_branch: - mflr r3 - mtlr r0 - addi r3, r3, __kernel_datapage_offset-data_page_branch - lwz r0,0(r3) - .cfi_restore lr - add r3,r0,r3 - blr - .cfi_endproc -V_FUNCTION_END(__get_datapage) - /* * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; * @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) mflr r12 .cfi_register lr,r12 mr r4,r3 - bl V_LOCAL_FUNC(__get_datapage) + get_datapage r3, r0 mtlr r12 addi r3,r3,CFG_SYSCALL_MAP64 cmpldi cr0,r4,0 @@ -75,7 +54,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 .cfi_register lr,r12 - bl V_LOCAL_FUNC(__get_datapage) + get_datapage r3, r0 ld r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 crclr cr0*4+so diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h new file mode 100644 index 000000000000..f2f0da0f65f3 --- /dev/null +++ b/arch/powerpc/kernel/vdso64/datapage.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +.macro get_datapage ptr, tmp + bcl 20,31,888f +888: + mflr \ptr + addi \ptr, \ptr, __kernel_datapage_offset - 888b + lwz \tmp, 0(\ptr) + add \ptr, \tmp, \ptr +.endm diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index 07bfe33fe874..7bcc879392cc 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -12,6 +12,8 @@ #include <asm/asm-offsets.h> #include <asm/unistd.h> +#include "datapage.h" + .text /* * Exact prototype of gettimeofday @@ -26,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) mr r11,r3 /* r11 holds tv */ mr r10,r4 /* r10 holds tz */ - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ + get_datapage r3, r0 cmpldi r11,0 /* check if tv is NULL */ beq 2f lis r7,1000000@ha /* load up USEC_PER_SEC */ @@ -71,7 +73,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) mflr r12 /* r12 saves lr */ .cfi_register lr,r12 mr r11,r4 /* r11 saves tp */ - bl V_LOCAL_FUNC(__get_datapage) /* get data page */ + get_datapage r3, r0 /* get data page */ lis r7,NSEC_PER_SEC@h /* want nanoseconds */ ori r7,r7,NSEC_PER_SEC@l beq cr5,70f @@ -218,7 +220,7 @@ V_FUNCTION_BEGIN(__kernel_time) .cfi_register lr,r12 mr r11,r3 /* r11 holds t */ - bl V_LOCAL_FUNC(__get_datapage) + get_datapage r3, r0 ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
__get_datapage() is only a few instructions to retrieve the address of the page where the kernel stores data to the VDSO. By inlining this function into its users, a bl/blr pair and a mflr/mtlr pair is avoided, plus a few reg moves. clock-gettime-monotonic: syscall: 514 nsec/call 396 nsec/call clock-gettime-monotonic: libc: 25 nsec/call 24 nsec/call clock-gettime-monotonic: vdso: 20 nsec/call 20 nsec/call clock-getres-monotonic: syscall: 347 nsec/call 372 nsec/call clock-getres-monotonic: libc: 19 nsec/call 19 nsec/call clock-getres-monotonic: vdso: 10 nsec/call 10 nsec/call clock-gettime-monotonic-coarse: syscall: 511 nsec/call 396 nsec/call clock-gettime-monotonic-coarse: libc: 23 nsec/call 21 nsec/call clock-gettime-monotonic-coarse: vdso: 15 nsec/call 13 nsec/call clock-gettime-realtime: syscall: 526 nsec/call 405 nsec/call clock-gettime-realtime: libc: 24 nsec/call 23 nsec/call clock-gettime-realtime: vdso: 18 nsec/call 18 nsec/call clock-getres-realtime: syscall: 342 nsec/call 372 nsec/call clock-getres-realtime: libc: 19 nsec/call 19 nsec/call clock-getres-realtime: vdso: 10 nsec/call 10 nsec/call clock-gettime-realtime-coarse: syscall: 515 nsec/call 373 nsec/call clock-gettime-realtime-coarse: libc: 23 nsec/call 22 nsec/call clock-gettime-realtime-coarse: vdso: 14 nsec/call 13 nsec/call Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32. Signed-off-by: Santosh Sivaraj <santosh@fossix.org> --- except for a couple of calls (1 or 2 nsec reduction), there are no improvements in the call times. Or is 10 nsec the minimum granularity?? So I don't know if its even worth updating vdso64 except to keep vdso32 and vdso64 equal. arch/powerpc/kernel/vdso64/cacheflush.S | 10 ++++---- arch/powerpc/kernel/vdso64/datapage.S | 29 ++++------------------- arch/powerpc/kernel/vdso64/datapage.h | 10 ++++++++ arch/powerpc/kernel/vdso64/gettimeofday.S | 8 ++++--- 4 files changed, 24 insertions(+), 33 deletions(-) create mode 100644 arch/powerpc/kernel/vdso64/datapage.h