diff mbox series

[v10,2/5] powerpc/vdso: Prepare for switching VDSO to generic C implementation.

Message ID 348528c33cd4007f3fee7fe643ef160843d09a6c.1596611196.git.christophe.leroy@csgroup.eu (mailing list archive)
State Superseded
Headers show
Series powerpc: switch VDSO to C implementation | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (14fd53d1e5ee7350564cac75e336f8c0dea13bc9)
snowpatch_ozlabs/checkpatch warning total: 0 errors, 8 warnings, 4 checks, 257 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Christophe Leroy Aug. 5, 2020, 7:09 a.m. UTC
Prepare for switching VDSO to generic C implementation in following
patch. Here, we:
- Prepare the helpers to call the C VDSO functions
- Prepare the required callbacks for the C VDSO functions
- Prepare the clocksource.h files to define VDSO_ARCH_CLOCKMODES
- Add the C trampolines to the generic C VDSO functions

powerpc is a bit special for VDSO as well as system calls in the
way that it requires setting CR SO bit which cannot be done in C.
Therefore, entry/exit needs to be performed in ASM.

Implementing __arch_get_vdso_data() would clobber the link register,
requiring the caller to save it. As the ASM calling function already
has to set a stack frame and saves the link register before calling
the C vdso function, retrieving the vdso data pointer there is lighter.

Implement __arch_vdso_capable() and:
- When the timebase is used, make it always return true.
- When the RTC clock is used, make it always return false.

Provide vdso_shift_ns(), as the generic x >> s gives the following
bad result:

  18:	35 25 ff e0 	addic.  r9,r5,-32
  1c:	41 80 00 10 	blt     2c <shift+0x14>
  20:	7c 64 4c 30 	srw     r4,r3,r9
  24:	38 60 00 00 	li      r3,0
...
  2c:	54 69 08 3c 	rlwinm  r9,r3,1,0,30
  30:	21 45 00 1f 	subfic  r10,r5,31
  34:	7c 84 2c 30 	srw     r4,r4,r5
  38:	7d 29 50 30 	slw     r9,r9,r10
  3c:	7c 63 2c 30 	srw     r3,r3,r5
  40:	7d 24 23 78 	or      r4,r9,r4

In our case the shift is always <= 32. In addition, the upper 32 bits
of the result are likely zero. Letting GCC know this also optimises the
following calculations.

With the patch, we get:
   0:	21 25 00 20 	subfic  r9,r5,32
   4:	7c 69 48 30 	slw     r9,r3,r9
   8:	7c 84 2c 30 	srw     r4,r4,r5
   c:	7d 24 23 78 	or      r4,r9,r4
  10:	7c 63 2c 30 	srw     r3,r3,r5

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
v10:
- Added a comment to explain the reason for the two stack frames.
- Moved back the .cfi_register next to mflr

v9:
- No more modification of __get_datapage(). Offset is added after.
- Adding a second stack frame because the PPC VDSO ABI doesn't force
the caller to set one.

v8:
- New, split out of the last patch of the series
---
 arch/powerpc/include/asm/clocksource.h       |   7 +
 arch/powerpc/include/asm/vdso/clocksource.h  |   7 +
 arch/powerpc/include/asm/vdso/gettimeofday.h | 185 +++++++++++++++++++
 arch/powerpc/kernel/vdso32/vgettimeofday.c   |  29 +++
 arch/powerpc/kernel/vdso64/vgettimeofday.c   |  29 +++
 5 files changed, 257 insertions(+)
 create mode 100644 arch/powerpc/include/asm/clocksource.h
 create mode 100644 arch/powerpc/include/asm/vdso/clocksource.h
 create mode 100644 arch/powerpc/include/asm/vdso/gettimeofday.h
 create mode 100644 arch/powerpc/kernel/vdso32/vgettimeofday.c
 create mode 100644 arch/powerpc/kernel/vdso64/vgettimeofday.c

Comments

Segher Boessenkool Aug. 5, 2020, 2:03 p.m. UTC | #1
Hi!

On Wed, Aug 05, 2020 at 07:09:23AM +0000, Christophe Leroy wrote:
> Provide vdso_shift_ns(), as the generic x >> s gives the following
> bad result:
> 
>   18:	35 25 ff e0 	addic.  r9,r5,-32
>   1c:	41 80 00 10 	blt     2c <shift+0x14>
>   20:	7c 64 4c 30 	srw     r4,r3,r9
>   24:	38 60 00 00 	li      r3,0
> ...
>   2c:	54 69 08 3c 	rlwinm  r9,r3,1,0,30
>   30:	21 45 00 1f 	subfic  r10,r5,31
>   34:	7c 84 2c 30 	srw     r4,r4,r5
>   38:	7d 29 50 30 	slw     r9,r9,r10
>   3c:	7c 63 2c 30 	srw     r3,r3,r5
>   40:	7d 24 23 78 	or      r4,r9,r4
> 
> In our case the shift is always <= 32. In addition,  the upper 32 bits
> of the result are likely nul. Lets GCC know it, it also optimises the
> following calculations.
> 
> With the patch, we get:
>    0:	21 25 00 20 	subfic  r9,r5,32
>    4:	7c 69 48 30 	slw     r9,r3,r9
>    8:	7c 84 2c 30 	srw     r4,r4,r5
>    c:	7d 24 23 78 	or      r4,r9,r4
>   10:	7c 63 2c 30 	srw     r3,r3,r5

See below.  Such code is valid on PowerPC for all shift < 64, and a
future version of GCC will do that (it is on various TODO lists, it is
bound to happen *some* day ;-), but it won't help you yet of course).


> +/*
> + * The macros sets two stack frames, one for the caller and one for the callee
> + * because there are no requirement for the caller to set a stack frame when
> + * calling VDSO so it may have omitted to set one, especially on PPC64
> + */

If the caller follows the ABI, there always is a stack frame.  So what
is going on?


> +/*
> + * powerpc specific delta calculation.
> + *
> + * This variant removes the masking of the subtraction because the
> + * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
> + * which would result in a pointless operation. The compiler cannot
> + * optimize it away as the mask comes from the vdso data and is not compile
> + * time constant.
> + */

It cannot optimise it because it does not know shift < 32.  The code
below is incorrect for shift equal to 32, fwiw.

> +static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
> +{
> +	return (cycles - last) * mult;
> +}
> +#define vdso_calc_delta vdso_calc_delta
> +
> +#ifndef __powerpc64__
> +static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
> +{
> +	u32 hi = ns >> 32;
> +	u32 lo = ns;
> +
> +	lo >>= shift;
> +	lo |= hi << (32 - shift);
> +	hi >>= shift;


> +	if (likely(hi == 0))
> +		return lo;

Removing these two lines shouldn't change generated object code?  Or not
make it worse, at least.

> +	return ((u64)hi << 32) | lo;
> +}


What does the compiler do for just

static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
	return ns >> (shift & 31);
}

?


Segher
Christophe Leroy Aug. 5, 2020, 4:40 p.m. UTC | #2
Hi,

On 08/05/2020 02:03 PM, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Aug 05, 2020 at 07:09:23AM +0000, Christophe Leroy wrote:
>> +/*
>> + * powerpc specific delta calculation.
>> + *
>> + * This variant removes the masking of the subtraction because the
>> + * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
>> + * which would result in a pointless operation. The compiler cannot
>> + * optimize it away as the mask comes from the vdso data and is not compile
>> + * time constant.
>> + */
> 
> It cannot optimise it because it does not know shift < 32.  The code
> below is incorrect for shift equal to 32, fwiw.

Is there a way to tell it ?

> 
>> +static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
>> +{
>> +	return (cycles - last) * mult;
>> +}
>> +#define vdso_calc_delta vdso_calc_delta
>> +
>> +#ifndef __powerpc64__
>> +static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
>> +{
>> +	u32 hi = ns >> 32;
>> +	u32 lo = ns;
>> +
>> +	lo >>= shift;
>> +	lo |= hi << (32 - shift);
>> +	hi >>= shift;
> 
> 
>> +	if (likely(hi == 0))
>> +		return lo;
> 
> Removing these two lines shouldn't change generated object code?  Or not
> make it worse, at least.

I remember it made a noticeable difference although I can't remember the 
details. See below with GCC 10.1. At least we see that with those two 
lines, GCC only sets a 16 bytes stack frame. Without those lines it sets 
a 32 bytes stack frame and seems to save some values for no reason.

With the two lines:

000006ac <__c_kernel_clock_gettime>:
  6ac:	28 03 00 0f 	cmplwi  r3,15
  6b0:	41 81 01 04 	bgt     7b4 <__c_kernel_clock_gettime+0x108>
  6b4:	39 40 00 01 	li      r10,1
  6b8:	7d 4a 18 30 	slw     r10,r10,r3
  6bc:	71 47 08 83 	andi.   r7,r10,2179
  6c0:	41 82 01 2c 	beq     7ec <__c_kernel_clock_gettime+0x140>
  6c4:	94 21 ff f0 	stwu    r1,-16(r1)
  6c8:	54 63 20 36 	rlwinm  r3,r3,4,0,27
  6cc:	93 e1 00 0c 	stw     r31,12(r1)
  6d0:	7d 85 1a 14 	add     r12,r5,r3
  6d4:	80 05 00 00 	lwz     r0,0(r5)
  6d8:	70 06 00 01 	andi.   r6,r0,1
  6dc:	40 82 00 d4 	bne     7b0 <__c_kernel_clock_gettime+0x104>
  6e0:	7d 4d 42 e6 	mftbu   r10
  6e4:	7d 6c 42 e6 	mftb    r11
  6e8:	7c ed 42 e6 	mftbu   r7
  6ec:	7c 0a 38 40 	cmplw   r10,r7
  6f0:	40 82 ff f0 	bne     6e0 <__c_kernel_clock_gettime+0x34>
  6f4:	80 e5 00 0c 	lwz     r7,12(r5)
  6f8:	80 65 00 08 	lwz     r3,8(r5)
  6fc:	7c e7 58 10 	subfc   r7,r7,r11
  700:	81 65 00 18 	lwz     r11,24(r5)
  704:	7d 43 51 10 	subfe   r10,r3,r10
  708:	7f e7 58 16 	mulhwu  r31,r7,r11
  70c:	7d 4a 59 d6 	mullw   r10,r10,r11
  710:	7c e7 59 d6 	mullw   r7,r7,r11
  714:	80 6c 00 2c 	lwz     r3,44(r12)
  718:	81 6c 00 28 	lwz     r11,40(r12)
  71c:	7c e7 18 14 	addc    r7,r7,r3
  720:	7d 4a fa 14 	add     r10,r10,r31
  724:	80 65 00 1c 	lwz     r3,28(r5)
  728:	7d 4a 59 14 	adde    r10,r10,r11
  72c:	7c e7 1c 30 	srw     r7,r7,r3
  730:	21 63 00 20 	subfic  r11,r3,32
  734:	7d 43 1c 31 	srw.    r3,r10,r3
  738:	7d 4a 58 30 	slw     r10,r10,r11
  73c:	7d 49 3b 78 	or      r9,r10,r7
  740:	39 00 00 00 	li      r8,0
  744:	40 82 00 84 	bne     7c8 <__c_kernel_clock_gettime+0x11c>
  748:	80 6c 00 24 	lwz     r3,36(r12)
  74c:	81 45 00 00 	lwz     r10,0(r5)
  750:	7c 00 50 40 	cmplw   r0,r10
  754:	40 a2 ff 80 	bne     6d4 <__c_kernel_clock_gettime+0x28>
  758:	2c 08 00 00 	cmpwi   r8,0
  75c:	41 82 00 7c 	beq     7d8 <__c_kernel_clock_gettime+0x12c>
  760:	3c e0 c4 65 	lis     r7,-15259
  764:	3c 00 3b 9a 	lis     r0,15258
  768:	60 e7 36 00 	ori     r7,r7,13824
  76c:	60 00 c9 ff 	ori     r0,r0,51711
  770:	7c a9 38 14 	addc    r5,r9,r7
  774:	7d 48 01 d4 	addme   r10,r8
  778:	2c 0a 00 00 	cmpwi   r10,0
  77c:	7d 48 53 78 	mr      r8,r10
  780:	7c a9 2b 78 	mr      r9,r5
  784:	38 c6 00 01 	addi    r6,r6,1
  788:	40 82 ff e8 	bne     770 <__c_kernel_clock_gettime+0xc4>
  78c:	7c 05 00 40 	cmplw   r5,r0
  790:	41 81 ff e0 	bgt     770 <__c_kernel_clock_gettime+0xc4>
  794:	7c 66 18 14 	addc    r3,r6,r3
  798:	90 64 00 00 	stw     r3,0(r4)
  79c:	91 24 00 04 	stw     r9,4(r4)
  7a0:	38 60 00 00 	li      r3,0
  7a4:	83 e1 00 0c 	lwz     r31,12(r1)
  7a8:	38 21 00 10 	addi    r1,r1,16
  7ac:	4e 80 00 20 	blr
  7b0:	4b ff ff 24 	b       6d4 <__c_kernel_clock_gettime+0x28>
  7b4:	38 00 00 f6 	li      r0,246
  7b8:	44 00 00 02 	sc
  7bc:	40 a3 00 08 	bns     7c4 <__c_kernel_clock_gettime+0x118>
  7c0:	7c 63 00 d0 	neg     r3,r3
  7c4:	4e 80 00 20 	blr
  7c8:	7d 2a 4b 78 	mr      r10,r9
  7cc:	7c 68 1b 78 	mr      r8,r3
  7d0:	7d 49 53 78 	mr      r9,r10
  7d4:	4b ff ff 74 	b       748 <__c_kernel_clock_gettime+0x9c>
  7d8:	3d 40 3b 9a 	lis     r10,15258
  7dc:	61 4a c9 ff 	ori     r10,r10,51711
  7e0:	7c 09 50 40 	cmplw   r9,r10
  7e4:	41 81 ff 7c 	bgt     760 <__c_kernel_clock_gettime+0xb4>
  7e8:	4b ff ff b0 	b       798 <__c_kernel_clock_gettime+0xec>
  7ec:	71 47 00 60 	andi.   r7,r10,96
  7f0:	54 69 20 36 	rlwinm  r9,r3,4,0,27
  7f4:	7d 25 4a 14 	add     r9,r5,r9
  7f8:	40 82 00 14 	bne     80c <__c_kernel_clock_gettime+0x160>
  7fc:	71 4a 00 10 	andi.   r10,r10,16
  800:	41 a2 ff b4 	beq     7b4 <__c_kernel_clock_gettime+0x108>
  804:	38 a5 00 f0 	addi    r5,r5,240
  808:	4b ff fe bc 	b       6c4 <__c_kernel_clock_gettime+0x18>
  80c:	81 05 00 00 	lwz     r8,0(r5)
  810:	71 0a 00 01 	andi.   r10,r8,1
  814:	40 a2 ff f8 	bne     80c <__c_kernel_clock_gettime+0x160>
  818:	80 69 00 24 	lwz     r3,36(r9)
  81c:	81 49 00 2c 	lwz     r10,44(r9)
  820:	80 e5 00 00 	lwz     r7,0(r5)
  824:	7c 08 38 40 	cmplw   r8,r7
  828:	40 a2 ff e4 	bne     80c <__c_kernel_clock_gettime+0x160>
  82c:	90 64 00 00 	stw     r3,0(r4)
  830:	91 44 00 04 	stw     r10,4(r4)
  834:	38 60 00 00 	li      r3,0
  838:	4e 80 00 20 	blr


Without the two lines:

000006ac <__c_kernel_clock_gettime>:
  6ac:	28 03 00 0f 	cmplwi  r3,15
  6b0:	41 81 01 14 	bgt     7c4 <__c_kernel_clock_gettime+0x118>
  6b4:	39 20 00 01 	li      r9,1
  6b8:	7d 29 18 30 	slw     r9,r9,r3
  6bc:	71 2a 08 83 	andi.   r10,r9,2179
  6c0:	41 82 01 2c 	beq     7ec <__c_kernel_clock_gettime+0x140>
  6c4:	94 21 ff e0 	stwu    r1,-32(r1)
  6c8:	54 63 20 36 	rlwinm  r3,r3,4,0,27
  6cc:	93 81 00 10 	stw     r28,16(r1)
  6d0:	93 a1 00 14 	stw     r29,20(r1)
  6d4:	93 c1 00 18 	stw     r30,24(r1)
  6d8:	93 e1 00 1c 	stw     r31,28(r1)
  6dc:	7c 65 1a 14 	add     r3,r5,r3
  6e0:	81 85 00 00 	lwz     r12,0(r5)
  6e4:	71 87 00 01 	andi.   r7,r12,1
  6e8:	40 82 00 d8 	bne     7c0 <__c_kernel_clock_gettime+0x114>
  6ec:	7d 2d 42 e6 	mftbu   r9
  6f0:	7c cc 42 e6 	mftb    r6
  6f4:	7d 4d 42 e6 	mftbu   r10
  6f8:	7c 09 50 40 	cmplw   r9,r10
  6fc:	40 82 ff f0 	bne     6ec <__c_kernel_clock_gettime+0x40>
  700:	83 83 00 28 	lwz     r28,40(r3)
  704:	83 a3 00 2c 	lwz     r29,44(r3)
  708:	81 65 00 08 	lwz     r11,8(r5)
  70c:	81 05 00 0c 	lwz     r8,12(r5)
  710:	83 c5 00 18 	lwz     r30,24(r5)
  714:	83 e5 00 1c 	lwz     r31,28(r5)
  718:	80 03 00 24 	lwz     r0,36(r3)
  71c:	81 45 00 00 	lwz     r10,0(r5)
  720:	7c 0c 50 40 	cmplw   r12,r10
  724:	40 a2 ff bc 	bne     6e0 <__c_kernel_clock_gettime+0x34>
  728:	7d 48 30 10 	subfc   r10,r8,r6
  72c:	7c cb 49 10 	subfe   r6,r11,r9
  730:	7c c6 f1 d6 	mullw   r6,r6,r30
  734:	7d 2a f0 16 	mulhwu  r9,r10,r30
  738:	7d 4a f1 d6 	mullw   r10,r10,r30
  73c:	7c c6 4a 14 	add     r6,r6,r9
  740:	7d 4a e8 14 	addc    r10,r10,r29
  744:	7c c6 e1 14 	adde    r6,r6,r28
  748:	7c c8 fc 30 	srw     r8,r6,r31
  74c:	2c 08 00 00 	cmpwi   r8,0
  750:	20 bf 00 20 	subfic  r5,r31,32
  754:	7d 4a fc 30 	srw     r10,r10,r31
  758:	7c c5 28 30 	slw     r5,r6,r5
  75c:	7c a9 53 78 	or      r9,r5,r10
  760:	41 82 00 78 	beq     7d8 <__c_kernel_clock_gettime+0x12c>
  764:	3c c0 c4 65 	lis     r6,-15259
  768:	3c 60 3b 9a 	lis     r3,15258
  76c:	60 c6 36 00 	ori     r6,r6,13824
  770:	60 63 c9 ff 	ori     r3,r3,51711
  774:	7c a9 30 14 	addc    r5,r9,r6
  778:	7d 48 01 d4 	addme   r10,r8
  77c:	2c 0a 00 00 	cmpwi   r10,0
  780:	7d 48 53 78 	mr      r8,r10
  784:	7c a9 2b 78 	mr      r9,r5
  788:	38 e7 00 01 	addi    r7,r7,1
  78c:	40 82 ff e8 	bne     774 <__c_kernel_clock_gettime+0xc8>
  790:	7c 05 18 40 	cmplw   r5,r3
  794:	41 81 ff e0 	bgt     774 <__c_kernel_clock_gettime+0xc8>
  798:	7c 07 00 14 	addc    r0,r7,r0
  79c:	90 04 00 00 	stw     r0,0(r4)
  7a0:	91 24 00 04 	stw     r9,4(r4)
  7a4:	38 60 00 00 	li      r3,0
  7a8:	83 81 00 10 	lwz     r28,16(r1)
  7ac:	83 a1 00 14 	lwz     r29,20(r1)
  7b0:	83 c1 00 18 	lwz     r30,24(r1)
  7b4:	83 e1 00 1c 	lwz     r31,28(r1)
  7b8:	38 21 00 20 	addi    r1,r1,32
  7bc:	4e 80 00 20 	blr
  7c0:	4b ff ff 20 	b       6e0 <__c_kernel_clock_gettime+0x34>
  7c4:	38 00 00 f6 	li      r0,246
  7c8:	44 00 00 02 	sc
  7cc:	40 a3 00 08 	bns     7d4 <__c_kernel_clock_gettime+0x128>
  7d0:	7c 63 00 d0 	neg     r3,r3
  7d4:	4e 80 00 20 	blr
  7d8:	3d 40 3b 9a 	lis     r10,15258
  7dc:	61 4a c9 ff 	ori     r10,r10,51711
  7e0:	7c 09 50 40 	cmplw   r9,r10
  7e4:	41 81 ff 80 	bgt     764 <__c_kernel_clock_gettime+0xb8>
  7e8:	4b ff ff b4 	b       79c <__c_kernel_clock_gettime+0xf0>
  7ec:	71 2a 00 60 	andi.   r10,r9,96
  7f0:	40 82 00 14 	bne     804 <__c_kernel_clock_gettime+0x158>
  7f4:	71 29 00 10 	andi.   r9,r9,16
  7f8:	41 a2 ff cc 	beq     7c4 <__c_kernel_clock_gettime+0x118>
  7fc:	38 a5 00 f0 	addi    r5,r5,240
  800:	4b ff fe c4 	b       6c4 <__c_kernel_clock_gettime+0x18>
  804:	54 69 20 36 	rlwinm  r9,r3,4,0,27
  808:	7d 25 4a 14 	add     r9,r5,r9
  80c:	81 05 00 00 	lwz     r8,0(r5)
  810:	71 0a 00 01 	andi.   r10,r8,1
  814:	40 82 00 28 	bne     83c <__c_kernel_clock_gettime+0x190>
  818:	80 09 00 24 	lwz     r0,36(r9)
  81c:	81 49 00 2c 	lwz     r10,44(r9)
  820:	80 e5 00 00 	lwz     r7,0(r5)
  824:	7c 08 38 40 	cmplw   r8,r7
  828:	40 a2 ff e4 	bne     80c <__c_kernel_clock_gettime+0x160>
  82c:	90 04 00 00 	stw     r0,0(r4)
  830:	91 44 00 04 	stw     r10,4(r4)
  834:	38 60 00 00 	li      r3,0
  838:	4e 80 00 20 	blr
  83c:	4b ff ff d0 	b       80c <__c_kernel_clock_gettime+0x160>


> 
>> +	return ((u64)hi << 32) | lo;
>> +}
> 
> 
> What does the compiler do for just
> 
> static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
> 	return ns >> (shift & 31);
> }
> 

Worse:

000006ac <__c_kernel_clock_gettime>:
  6ac:	28 03 00 0f 	cmplwi  r3,15
  6b0:	41 81 01 30 	bgt     7e0 <__c_kernel_clock_gettime+0x134>
  6b4:	39 20 00 01 	li      r9,1
  6b8:	7d 29 18 30 	slw     r9,r9,r3
  6bc:	71 2a 08 83 	andi.   r10,r9,2179
  6c0:	41 82 01 48 	beq     808 <__c_kernel_clock_gettime+0x15c>
  6c4:	94 21 ff e0 	stwu    r1,-32(r1)
  6c8:	54 63 20 36 	rlwinm  r3,r3,4,0,27
  6cc:	93 81 00 10 	stw     r28,16(r1)
  6d0:	93 a1 00 14 	stw     r29,20(r1)
  6d4:	93 c1 00 18 	stw     r30,24(r1)
  6d8:	93 e1 00 1c 	stw     r31,28(r1)
  6dc:	7c 65 1a 14 	add     r3,r5,r3
  6e0:	80 c5 00 00 	lwz     r6,0(r5)
  6e4:	70 c7 00 01 	andi.   r7,r6,1
  6e8:	40 82 00 f4 	bne     7dc <__c_kernel_clock_gettime+0x130>
  6ec:	7d 2d 42 e6 	mftbu   r9
  6f0:	7d 0c 42 e6 	mftb    r8
  6f4:	7d 4d 42 e6 	mftbu   r10
  6f8:	7c 09 50 40 	cmplw   r9,r10
  6fc:	40 82 ff f0 	bne     6ec <__c_kernel_clock_gettime+0x40>
  700:	83 83 00 28 	lwz     r28,40(r3)
  704:	83 c3 00 2c 	lwz     r30,44(r3)
  708:	81 65 00 08 	lwz     r11,8(r5)
  70c:	81 45 00 0c 	lwz     r10,12(r5)
  710:	83 e5 00 18 	lwz     r31,24(r5)
  714:	81 85 00 1c 	lwz     r12,28(r5)
  718:	80 03 00 24 	lwz     r0,36(r3)
  71c:	83 a5 00 00 	lwz     r29,0(r5)
  720:	7c 06 e8 40 	cmplw   r6,r29
  724:	40 a2 ff bc 	bne     6e0 <__c_kernel_clock_gettime+0x34>
  728:	7d 0a 40 10 	subfc   r8,r10,r8
  72c:	7c cb 49 10 	subfe   r6,r11,r9
  730:	7c c6 f9 d6 	mullw   r6,r6,r31
  734:	7d 28 f8 16 	mulhwu  r9,r8,r31
  738:	7d 08 f9 d6 	mullw   r8,r8,r31
  73c:	55 8c 06 fe 	clrlwi  r12,r12,27
  740:	7f c8 f0 14 	addc    r30,r8,r30
  744:	7c c6 4a 14 	add     r6,r6,r9
  748:	7c c6 e1 14 	adde    r6,r6,r28
  74c:	34 6c ff e0 	addic.  r3,r12,-32
  750:	41 80 00 70 	blt     7c0 <__c_kernel_clock_gettime+0x114>
  754:	7c c9 1c 30 	srw     r9,r6,r3
  758:	39 00 00 00 	li      r8,0
  75c:	2c 08 00 00 	cmpwi   r8,0
  760:	41 82 00 94 	beq     7f4 <__c_kernel_clock_gettime+0x148>
  764:	3c c0 c4 65 	lis     r6,-15259
  768:	3c 60 3b 9a 	lis     r3,15258
  76c:	60 c6 36 00 	ori     r6,r6,13824
  770:	60 63 c9 ff 	ori     r3,r3,51711
  774:	7c a9 30 14 	addc    r5,r9,r6
  778:	7d 48 01 d4 	addme   r10,r8
  77c:	2c 0a 00 00 	cmpwi   r10,0
  780:	7d 48 53 78 	mr      r8,r10
  784:	7c a9 2b 78 	mr      r9,r5
  788:	38 e7 00 01 	addi    r7,r7,1
  78c:	40 82 ff e8 	bne     774 <__c_kernel_clock_gettime+0xc8>
  790:	7c 05 18 40 	cmplw   r5,r3
  794:	41 81 ff e0 	bgt     774 <__c_kernel_clock_gettime+0xc8>
  798:	7c 07 00 14 	addc    r0,r7,r0
  79c:	90 04 00 00 	stw     r0,0(r4)
  7a0:	91 24 00 04 	stw     r9,4(r4)
  7a4:	38 60 00 00 	li      r3,0
  7a8:	83 81 00 10 	lwz     r28,16(r1)
  7ac:	83 a1 00 14 	lwz     r29,20(r1)
  7b0:	83 c1 00 18 	lwz     r30,24(r1)
  7b4:	83 e1 00 1c 	lwz     r31,28(r1)
  7b8:	38 21 00 20 	addi    r1,r1,32
  7bc:	4e 80 00 20 	blr
  7c0:	54 c3 08 3c 	rlwinm  r3,r6,1,0,30
  7c4:	21 6c 00 1f 	subfic  r11,r12,31
  7c8:	7c 63 58 30 	slw     r3,r3,r11
  7cc:	7f c9 64 30 	srw     r9,r30,r12
  7d0:	7c 69 4b 78 	or      r9,r3,r9
  7d4:	7c c8 64 30 	srw     r8,r6,r12
  7d8:	4b ff ff 84 	b       75c <__c_kernel_clock_gettime+0xb0>
  7dc:	4b ff ff 04 	b       6e0 <__c_kernel_clock_gettime+0x34>
  7e0:	38 00 00 f6 	li      r0,246
  7e4:	44 00 00 02 	sc
  7e8:	40 a3 00 08 	bns     7f0 <__c_kernel_clock_gettime+0x144>
  7ec:	7c 63 00 d0 	neg     r3,r3
  7f0:	4e 80 00 20 	blr
  7f4:	3d 40 3b 9a 	lis     r10,15258
  7f8:	61 4a c9 ff 	ori     r10,r10,51711
  7fc:	7c 09 50 40 	cmplw   r9,r10
  800:	41 81 ff 64 	bgt     764 <__c_kernel_clock_gettime+0xb8>
  804:	4b ff ff 98 	b       79c <__c_kernel_clock_gettime+0xf0>
  808:	71 2a 00 60 	andi.   r10,r9,96
  80c:	40 82 00 14 	bne     820 <__c_kernel_clock_gettime+0x174>
  810:	71 29 00 10 	andi.   r9,r9,16
  814:	41 a2 ff cc 	beq     7e0 <__c_kernel_clock_gettime+0x134>
  818:	38 a5 00 f0 	addi    r5,r5,240
  81c:	4b ff fe a8 	b       6c4 <__c_kernel_clock_gettime+0x18>
  820:	54 69 20 36 	rlwinm  r9,r3,4,0,27
  824:	7d 25 4a 14 	add     r9,r5,r9
  828:	81 05 00 00 	lwz     r8,0(r5)
  82c:	71 0a 00 01 	andi.   r10,r8,1
  830:	40 82 00 28 	bne     858 <__c_kernel_clock_gettime+0x1ac>
  834:	80 09 00 24 	lwz     r0,36(r9)
  838:	81 49 00 2c 	lwz     r10,44(r9)
  83c:	80 e5 00 00 	lwz     r7,0(r5)
  840:	7c 08 38 40 	cmplw   r8,r7
  844:	40 a2 ff e4 	bne     828 <__c_kernel_clock_gettime+0x17c>
  848:	90 04 00 00 	stw     r0,0(r4)
  84c:	91 44 00 04 	stw     r10,4(r4)
  850:	38 60 00 00 	li      r3,0
  854:	4e 80 00 20 	blr
  858:	4b ff ff d0 	b       828 <__c_kernel_clock_gettime+0x17c>

Christophe
Christophe Leroy Aug. 5, 2020, 4:51 p.m. UTC | #3
Hi Again,

Le 05/08/2020 à 16:03, Segher Boessenkool a écrit :
> Hi!
> 
> On Wed, Aug 05, 2020 at 07:09:23AM +0000, Christophe Leroy wrote:
>> +/*
>> + * The macros sets two stack frames, one for the caller and one for the callee
>> + * because there are no requirement for the caller to set a stack frame when
>> + * calling VDSO so it may have omitted to set one, especially on PPC64
>> + */
> 
> If the caller follows the ABI, there always is a stack frame.  So what
> is going on?

Looks like it is not the case. See discussion at 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/2a67c333893454868bbfda773ba4b01c20272a5d.1588079622.git.christophe.leroy@c-s.fr/

Seems like GCC uses the redzone and doesn't set a stack frame. I guess 
it doesn't know that the inline assembly contains a function call so it 
doesn't set the frame.


Christophe
Segher Boessenkool Aug. 5, 2020, 6:40 p.m. UTC | #4
Hi!

On Wed, Aug 05, 2020 at 04:40:16PM +0000, Christophe Leroy wrote:
> >It cannot optimise it because it does not know shift < 32.  The code
> >below is incorrect for shift equal to 32, fwiw.
> 
> Is there a way to tell it ?

Sure, for example the &31 should work (but it doesn't, with the GCC
version you used -- which version is that?)

> >What does the compiler do for just
> >
> >static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
> >	return ns >> (shift & 31);
> >}
> >
> 
> Worse:

I cannot make heads or tails of all that branch spaghetti, sorry.

>  73c:	55 8c 06 fe 	clrlwi  r12,r12,27
>  740:	7f c8 f0 14 	addc    r30,r8,r30
>  744:	7c c6 4a 14 	add     r6,r6,r9
>  748:	7c c6 e1 14 	adde    r6,r6,r28
>  74c:	34 6c ff e0 	addic.  r3,r12,-32
>  750:	41 80 00 70 	blt     7c0 <__c_kernel_clock_gettime+0x114>

This branch is always true.  Hrm.


Segher
Segher Boessenkool Aug. 5, 2020, 8:55 p.m. UTC | #5
Hi!

On Wed, Aug 05, 2020 at 06:51:44PM +0200, Christophe Leroy wrote:
> Le 05/08/2020 à 16:03, Segher Boessenkool a écrit :
> >On Wed, Aug 05, 2020 at 07:09:23AM +0000, Christophe Leroy wrote:
> >>+/*
> >>+ * The macros sets two stack frames, one for the caller and one for the 
> >>callee
> >>+ * because there are no requirement for the caller to set a stack frame 
> >>when
> >>+ * calling VDSO so it may have omitted to set one, especially on PPC64
> >>+ */
> >
> >If the caller follows the ABI, there always is a stack frame.  So what
> >is going on?
> 
> Looks like it is not the case. See discussion at 
> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/2a67c333893454868bbfda773ba4b01c20272a5d.1588079622.git.christophe.leroy@c-s.fr/
> 
> Seems like GCC uses the redzone and doesn't set a stack frame. I guess 
> it doesn't know that the inline assembly contains a function call so it 
> doesn't set the frame.

Yes, that is the problem.  See
https://gcc.gnu.org/onlinedocs/gcc-10.2.0/gcc/Extended-Asm.html#AssemblerTemplate
where this is (briefly) discussed:
  "Accessing data from C programs without using input/output operands
  (such as by using global symbols directly from the assembler
  template) may not work as expected. Similarly, calling functions
  directly from an assembler template requires a detailed understanding
  of the target assembler and ABI."

I don't know of a good way to tell GCC some function needs a frame (that
is, one that doesn't result in extra code other than to set up the
frame).  I'll think about it.


Segher
Christophe Leroy Aug. 6, 2020, 5:46 a.m. UTC | #6
Hi,

On 08/05/2020 06:40 PM, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, Aug 05, 2020 at 04:40:16PM +0000, Christophe Leroy wrote:
>>> It cannot optimise it because it does not know shift < 32.  The code
>>> below is incorrect for shift equal to 32, fwiw.
>>
>> Is there a way to tell it ?
> 
> Sure, for example the &31 should work (but it doesn't, with the GCC
> version you used -- which version is that?)

GCC 10.1

> 
>>> What does the compiler do for just
>>>
>>> static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
>>> 	return ns >> (shift & 31);
>>> }
>>>
>>
>> Worse:
> 
> I cannot make heads or tails of all that branch spaghetti, sorry.
> 
>>   73c:	55 8c 06 fe 	clrlwi  r12,r12,27
>>   740:	7f c8 f0 14 	addc    r30,r8,r30
>>   744:	7c c6 4a 14 	add     r6,r6,r9
>>   748:	7c c6 e1 14 	adde    r6,r6,r28
>>   74c:	34 6c ff e0 	addic.  r3,r12,-32
>>   750:	41 80 00 70 	blt     7c0 <__c_kernel_clock_gettime+0x114>
> 
> This branch is always true.  Hrm.

As a standalone function:

With your suggestion:

000006ac <vdso_shift_ns>:
  6ac:	54 a5 06 fe 	clrlwi  r5,r5,27
  6b0:	35 25 ff e0 	addic.  r9,r5,-32
  6b4:	41 80 00 10 	blt     6c4 <vdso_shift_ns+0x18>
  6b8:	7c 64 4c 30 	srw     r4,r3,r9
  6bc:	38 60 00 00 	li      r3,0
  6c0:	4e 80 00 20 	blr
  6c4:	54 69 08 3c 	rlwinm  r9,r3,1,0,30
  6c8:	21 45 00 1f 	subfic  r10,r5,31
  6cc:	7c 84 2c 30 	srw     r4,r4,r5
  6d0:	7d 29 50 30 	slw     r9,r9,r10
  6d4:	7c 63 2c 30 	srw     r3,r3,r5
  6d8:	7d 24 23 78 	or      r4,r9,r4
  6dc:	4e 80 00 20 	blr


With the version as is in my series:

000006ac <vdso_shift_ns>:
  6ac:	21 25 00 20 	subfic  r9,r5,32
  6b0:	7c 69 48 30 	slw     r9,r3,r9
  6b4:	7c 84 2c 30 	srw     r4,r4,r5
  6b8:	7d 24 23 78 	or      r4,r9,r4
  6bc:	7c 63 2c 30 	srw     r3,r3,r5
  6c0:	4e 80 00 20 	blr


Christophe
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/clocksource.h b/arch/powerpc/include/asm/clocksource.h
new file mode 100644
index 000000000000..482185566b0c
--- /dev/null
+++ b/arch/powerpc/include/asm/clocksource.h
@@ -0,0 +1,7 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_CLOCKSOURCE_H
+#define _ASM_CLOCKSOURCE_H
+
+#include <asm/vdso/clocksource.h>
+
+#endif
diff --git a/arch/powerpc/include/asm/vdso/clocksource.h b/arch/powerpc/include/asm/vdso/clocksource.h
new file mode 100644
index 000000000000..ec5d672d2569
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/clocksource.h
@@ -0,0 +1,7 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSOCLOCKSOURCE_H
+#define __ASM_VDSOCLOCKSOURCE_H
+
+#define VDSO_ARCH_CLOCKMODES	VDSO_CLOCKMODE_ARCHTIMER
+
+#endif
diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
new file mode 100644
index 000000000000..e2c462796a22
--- /dev/null
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -0,0 +1,185 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_GETTIMEOFDAY_H
+#define __ASM_VDSO_GETTIMEOFDAY_H
+
+#include <asm/ptrace.h>
+
+#ifdef __ASSEMBLY__
+
+/*
+ * The macro sets two stack frames, one for the caller and one for the callee
+ * because there is no requirement for the caller to set a stack frame when
+ * calling VDSO so it may have omitted to set one, especially on PPC64
+ */
+
+.macro cvdso_call funct
+  .cfi_startproc
+	PPC_STLU	r1, -STACK_FRAME_OVERHEAD(r1)
+	mflr		r0
+  .cfi_register lr, r0
+	PPC_STLU	r1, -STACK_FRAME_OVERHEAD(r1)
+	PPC_STL		r0, STACK_FRAME_OVERHEAD + PPC_LR_STKOFF(r1)
+	get_datapage	r5, r0
+	addi		r5, r5, VDSO_DATA_OFFSET
+	bl		\funct
+	PPC_LL		r0, STACK_FRAME_OVERHEAD + PPC_LR_STKOFF(r1)
+	cmpwi		r3, 0
+	mtlr		r0
+  .cfi_restore lr
+	addi		r1, r1, 2 * STACK_FRAME_OVERHEAD
+	crclr		so
+	beqlr+
+	crset		so
+	neg		r3, r3
+	blr
+  .cfi_endproc
+.endm
+
+.macro cvdso_call_time funct
+  .cfi_startproc
+	PPC_STLU	r1, -STACK_FRAME_OVERHEAD(r1)
+	mflr		r0
+  .cfi_register lr, r0
+	PPC_STLU	r1, -STACK_FRAME_OVERHEAD(r1)
+	PPC_STL		r0, STACK_FRAME_OVERHEAD + PPC_LR_STKOFF(r1)
+	get_datapage	r4, r0
+	addi		r4, r4, VDSO_DATA_OFFSET
+	bl		\funct
+	PPC_LL		r0, STACK_FRAME_OVERHEAD + PPC_LR_STKOFF(r1)
+	crclr		so
+	mtlr		r0
+  .cfi_restore lr
+	addi		r1, r1, 2 * STACK_FRAME_OVERHEAD
+	blr
+  .cfi_endproc
+.endm
+
+#else
+
+#include <asm/time.h>
+#include <asm/unistd.h>
+#include <uapi/linux/time.h>
+
+#define VDSO_HAS_CLOCK_GETRES		1
+
+#define VDSO_HAS_TIME			1
+
+static __always_inline int do_syscall_2(const unsigned long _r0, const unsigned long _r3,
+					const unsigned long _r4)
+{
+	register long r0 asm("r0") = _r0;
+	register unsigned long r3 asm("r3") = _r3;
+	register unsigned long r4 asm("r4") = _r4;
+	register int ret asm ("r3");
+
+	asm volatile(
+		"       sc\n"
+		"	bns+	1f\n"
+		"	neg	%0, %0\n"
+		"1:\n"
+	: "=r" (ret), "+r" (r4), "+r" (r0)
+	: "r" (r3)
+	: "memory", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "cr0", "ctr");
+
+	return ret;
+}
+
+static __always_inline
+int gettimeofday_fallback(struct __kernel_old_timeval *_tv, struct timezone *_tz)
+{
+	return do_syscall_2(__NR_gettimeofday, (unsigned long)_tv, (unsigned long)_tz);
+}
+
+static __always_inline
+int clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+	return do_syscall_2(__NR_clock_gettime, _clkid, (unsigned long)_ts);
+}
+
+static __always_inline
+int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+	return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts);
+}
+
+#ifdef CONFIG_VDSO32
+
+#define BUILD_VDSO32		1
+
+static __always_inline
+int clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
+{
+	return do_syscall_2(__NR_clock_gettime, _clkid, (unsigned long)_ts);
+}
+
+static __always_inline
+int clock_getres32_fallback(clockid_t _clkid, struct old_timespec32 *_ts)
+{
+	return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts);
+}
+#endif
+
+static __always_inline u64 __arch_get_hw_counter(s32 clock_mode)
+{
+	return get_tb();
+}
+
+const struct vdso_data *__arch_get_vdso_data(void);
+
+static inline bool vdso_clocksource_ok(const struct vdso_data *vd)
+{
+	return !__USE_RTC();
+}
+#define vdso_clocksource_ok vdso_clocksource_ok
+
+/*
+ * powerpc specific delta calculation.
+ *
+ * This variant removes the masking of the subtraction because the
+ * clocksource mask of all VDSO capable clocksources on powerpc is U64_MAX
+ * which would result in a pointless operation. The compiler cannot
+ * optimize it away as the mask comes from the vdso data and is not compile
+ * time constant.
+ */
+static __always_inline u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+{
+	return (cycles - last) * mult;
+}
+#define vdso_calc_delta vdso_calc_delta
+
+#ifndef __powerpc64__
+static __always_inline u64 vdso_shift_ns(u64 ns, unsigned long shift)
+{
+	u32 hi = ns >> 32;
+	u32 lo = ns;
+
+	lo >>= shift;
+	lo |= hi << (32 - shift);
+	hi >>= shift;
+
+	if (likely(hi == 0))
+		return lo;
+
+	return ((u64)hi << 32) | lo;
+}
+#define vdso_shift_ns vdso_shift_ns
+#endif
+
+#ifdef __powerpc64__
+int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts,
+			     const struct vdso_data *vd);
+int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res,
+			    const struct vdso_data *vd);
+#else
+int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
+			     const struct vdso_data *vd);
+int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
+			    const struct vdso_data *vd);
+#endif
+int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
+			    const struct vdso_data *vd);
+__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time,
+				    const struct vdso_data *vd);
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/powerpc/kernel/vdso32/vgettimeofday.c b/arch/powerpc/kernel/vdso32/vgettimeofday.c
new file mode 100644
index 000000000000..0b9ab4c22ef2
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/vgettimeofday.c
@@ -0,0 +1,29 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Powerpc userspace implementations of gettimeofday() and similar.
+ */
+#include <linux/time.h>
+#include <linux/types.h>
+
+int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
+			     const struct vdso_data *vd)
+{
+	return __cvdso_clock_gettime32_data(vd, clock, ts);
+}
+
+int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
+			    const struct vdso_data *vd)
+{
+	return __cvdso_gettimeofday_data(vd, tv, tz);
+}
+
+int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
+			    const struct vdso_data *vd)
+{
+	return __cvdso_clock_getres_time32_data(vd, clock_id, res);
+}
+
+__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd)
+{
+	return __cvdso_time_data(vd, time);
+}
diff --git a/arch/powerpc/kernel/vdso64/vgettimeofday.c b/arch/powerpc/kernel/vdso64/vgettimeofday.c
new file mode 100644
index 000000000000..5b5500058344
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/vgettimeofday.c
@@ -0,0 +1,29 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Powerpc userspace implementations of gettimeofday() and similar.
+ */
+#include <linux/time.h>
+#include <linux/types.h>
+
+int __c_kernel_clock_gettime(clockid_t clock, struct __kernel_timespec *ts,
+			     const struct vdso_data *vd)
+{
+	return __cvdso_clock_gettime_data(vd, clock, ts);
+}
+
+int __c_kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz,
+			    const struct vdso_data *vd)
+{
+	return __cvdso_gettimeofday_data(vd, tv, tz);
+}
+
+int __c_kernel_clock_getres(clockid_t clock_id, struct __kernel_timespec *res,
+			    const struct vdso_data *vd)
+{
+	return __cvdso_clock_getres_data(vd, clock_id, res);
+}
+
+__kernel_old_time_t __c_kernel_time(__kernel_old_time_t *time, const struct vdso_data *vd)
+{
+	return __cvdso_time_data(vd, time);
+}