diff mbox series

[v8,2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()

Message ID 0d2201efe3c7727f2acc718aefd7c5bb22c66c57.1588079622.git.christophe.leroy@c-s.fr (mailing list archive)
State Superseded
Headers show
Series powerpc: switch VDSO to C implementation | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (54dc28ff5e0b3585224d49a31b53e030342ca5c3)
snowpatch_ozlabs/checkpatch warning total: 1 errors, 0 warnings, 0 checks, 174 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Christophe Leroy April 28, 2020, 1:16 p.m. UTC
The VDSO datapage and the text pages are always located immediately
next to each other, so it can be hardcoded without an indirection
through __kernel_datapage_offset

In order to ease things, move the data page in front like other
arches, that way there is no need to know the size of the library
to locate the data page.

Before:
clock-getres-realtime-coarse:    vdso: 714 nsec/call
clock-gettime-realtime-coarse:    vdso: 792 nsec/call
clock-gettime-realtime:    vdso: 1243 nsec/call

After:
clock-getres-realtime-coarse:    vdso: 699 nsec/call
clock-gettime-realtime-coarse:    vdso: 784 nsec/call
clock-gettime-realtime:    vdso: 1231 nsec/call

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
v7:
- Moved the removal of the tmp param of __get_datapage()
in a subsequent patch
- Included the addition of the offset param to __get_datapage()
in the further preparatory patch
---
 arch/powerpc/include/asm/vdso_datapage.h |  8 ++--
 arch/powerpc/kernel/vdso.c               | 53 ++++--------------------
 arch/powerpc/kernel/vdso32/datapage.S    |  3 --
 arch/powerpc/kernel/vdso32/vdso32.lds.S  |  7 +---
 arch/powerpc/kernel/vdso64/datapage.S    |  3 --
 arch/powerpc/kernel/vdso64/vdso64.lds.S  |  7 +---
 6 files changed, 16 insertions(+), 65 deletions(-)

Comments

Michael Ellerman July 16, 2020, 2:59 a.m. UTC | #1
Christophe Leroy <christophe.leroy@c-s.fr> writes:
> The VDSO datapage and the text pages are always located immediately
> next to each other, so it can be hardcoded without an indirection
> through __kernel_datapage_offset
>
> In order to ease things, move the data page in front like other
> arches, that way there is no need to know the size of the library
> to locate the data page.
>
> Before:
> clock-getres-realtime-coarse:    vdso: 714 nsec/call
> clock-gettime-realtime-coarse:    vdso: 792 nsec/call
> clock-gettime-realtime:    vdso: 1243 nsec/call
>
> After:
> clock-getres-realtime-coarse:    vdso: 699 nsec/call
> clock-gettime-realtime-coarse:    vdso: 784 nsec/call
> clock-gettime-realtime:    vdso: 1231 nsec/call
>
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
> v7:
> - Moved the removal of the tmp param of __get_datapage()
> in a subsequent patch
> - Included the addition of the offset param to __get_datapage()
> in the further preparatory patch
> ---
>  arch/powerpc/include/asm/vdso_datapage.h |  8 ++--
>  arch/powerpc/kernel/vdso.c               | 53 ++++--------------------
>  arch/powerpc/kernel/vdso32/datapage.S    |  3 --
>  arch/powerpc/kernel/vdso32/vdso32.lds.S  |  7 +---
>  arch/powerpc/kernel/vdso64/datapage.S    |  3 --
>  arch/powerpc/kernel/vdso64/vdso64.lds.S  |  7 +---
>  6 files changed, 16 insertions(+), 65 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
> index b9ef6cf50ea5..11886467dfdf 100644
> --- a/arch/powerpc/include/asm/vdso_datapage.h
> +++ b/arch/powerpc/include/asm/vdso_datapage.h
> @@ -118,10 +118,12 @@ extern struct vdso_data *vdso_data;
>  
>  .macro get_datapage ptr, tmp
>  	bcl	20, 31, .+4
> +999:
>  	mflr	\ptr
> -	addi	\ptr, \ptr, (__kernel_datapage_offset - (.-4))@l
> -	lwz	\tmp, 0(\ptr)
> -	add	\ptr, \tmp, \ptr
> +#if CONFIG_PPC_PAGE_SHIFT > 14
> +	addis	\ptr, \ptr, (_vdso_datapage - 999b)@ha
> +#endif
> +	addi	\ptr, \ptr, (_vdso_datapage - 999b)@l
>  .endm
>  
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
> index f38f26e844b6..d33fa22ddbed 100644
> --- a/arch/powerpc/kernel/vdso.c
> +++ b/arch/powerpc/kernel/vdso.c
> @@ -190,7 +190,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>  	 * install_special_mapping or the perf counter mmap tracking code
>  	 * will fail to recognise it as a vDSO (since arch_vma_name fails).
>  	 */
> -	current->mm->context.vdso_base = vdso_base;
> +	current->mm->context.vdso_base = vdso_base + PAGE_SIZE;

I merged this but then realised it breaks the display of the vdso in /proc/self/maps.

ie. the vdso vma gets no name:

  # cat /proc/self/maps
  110f90000-110fa0000 r-xp 00000000 08:03 17021844                         /usr/bin/cat
  110fa0000-110fb0000 r--p 00000000 08:03 17021844                         /usr/bin/cat
  110fb0000-110fc0000 rw-p 00010000 08:03 17021844                         /usr/bin/cat
  126000000-126030000 rw-p 00000000 00:00 0                                [heap]
  7fffa8790000-7fffa87d0000 rw-p 00000000 00:00 0 
  7fffa87d0000-7fffa8830000 r--p 00000000 08:03 17521786                   /usr/lib/locale/en_AU.utf8/LC_CTYPE
  7fffa8830000-7fffa8840000 r--p 00000000 08:03 16958337                   /usr/lib/locale/en_AU.utf8/LC_NUMERIC
  7fffa8840000-7fffa8850000 r--p 00000000 08:03 8501358                    /usr/lib/locale/en_AU.utf8/LC_TIME
  7fffa8850000-7fffa8ad0000 r--p 00000000 08:03 16870886                   /usr/lib/locale/en_AU.utf8/LC_COLLATE
  7fffa8ad0000-7fffa8ae0000 r--p 00000000 08:03 8509433                    /usr/lib/locale/en_AU.utf8/LC_MONETARY
  7fffa8ae0000-7fffa8af0000 r--p 00000000 08:03 25383753                   /usr/lib/locale/en_AU.utf8/LC_MESSAGES/SYS_LC_MESSAGES
  7fffa8af0000-7fffa8b00000 r--p 00000000 08:03 17521790                   /usr/lib/locale/en_AU.utf8/LC_PAPER
  7fffa8b00000-7fffa8b10000 r--p 00000000 08:03 8501354                    /usr/lib/locale/en_AU.utf8/LC_NAME
  7fffa8b10000-7fffa8b20000 r--p 00000000 08:03 8509431                    /usr/lib/locale/en_AU.utf8/LC_ADDRESS
  7fffa8b20000-7fffa8b30000 r--p 00000000 08:03 8509434                    /usr/lib/locale/en_AU.utf8/LC_TELEPHONE
  7fffa8b30000-7fffa8b40000 r--p 00000000 08:03 17521787                   /usr/lib/locale/en_AU.utf8/LC_MEASUREMENT
  7fffa8b40000-7fffa8b50000 r--s 00000000 08:03 25623315                   /usr/lib64/gconv/gconv-modules.cache
  7fffa8b50000-7fffa8d40000 r-xp 00000000 08:03 25383789                   /usr/lib64/libc-2.30.so
  7fffa8d40000-7fffa8d50000 r--p 001e0000 08:03 25383789                   /usr/lib64/libc-2.30.so
  7fffa8d50000-7fffa8d60000 rw-p 001f0000 08:03 25383789                   /usr/lib64/libc-2.30.so
  7fffa8d60000-7fffa8d70000 r--p 00000000 08:03 8509432                    /usr/lib/locale/en_AU.utf8/LC_IDENTIFICATION
  7fffa8d70000-7fffa8d90000 r-xp 00000000 00:00 0						<--- missing
  7fffa8d90000-7fffa8dc0000 r-xp 00000000 08:03 25383781                   /usr/lib64/ld-2.30.so
  7fffa8dc0000-7fffa8dd0000 r--p 00020000 08:03 25383781                   /usr/lib64/ld-2.30.so
  7fffa8dd0000-7fffa8de0000 rw-p 00030000 08:03 25383781                   /usr/lib64/ld-2.30.so
  7fffc31c0000-7fffc31f0000 rw-p 00000000 00:00 0                          [stack]


And it's also going to break the logic in arch_unmap() to detect if
we're unmapping (part of) the VDSO. And it will break arch_remap() too.

And the logic to recognise the signal trampoline in
arch/powerpc/perf/callchain_*.c as well.

So I'm going to rebase and drop this for now.

Basically we have a bunch of places that assume that vdso_base is == the
start of the VDSO vma, and also that the code starts there. So that will
need some work to tease out all those assumptions and make them work
with this change.

cheers
Christophe Leroy Aug. 4, 2020, 11:17 a.m. UTC | #2
On 07/16/2020 02:59 AM, Michael Ellerman wrote:
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>> The VDSO datapage and the text pages are always located immediately
>> next to each other, so it can be hardcoded without an indirection
>> through __kernel_datapage_offset
>>
>> In order to ease things, move the data page in front like other
>> arches, that way there is no need to know the size of the library
>> to locate the data page.
>>
>> Before:
>> clock-getres-realtime-coarse:    vdso: 714 nsec/call
>> clock-gettime-realtime-coarse:    vdso: 792 nsec/call
>> clock-gettime-realtime:    vdso: 1243 nsec/call
>>
>> After:
>> clock-getres-realtime-coarse:    vdso: 699 nsec/call
>> clock-gettime-realtime-coarse:    vdso: 784 nsec/call
>> clock-gettime-realtime:    vdso: 1231 nsec/call
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>> ---
>> v7:
>> - Moved the removal of the tmp param of __get_datapage()
>> in a subsequent patch
>> - Included the addition of the offset param to __get_datapage()
>> in the further preparatory patch
>> ---
>>   arch/powerpc/include/asm/vdso_datapage.h |  8 ++--
>>   arch/powerpc/kernel/vdso.c               | 53 ++++--------------------
>>   arch/powerpc/kernel/vdso32/datapage.S    |  3 --
>>   arch/powerpc/kernel/vdso32/vdso32.lds.S  |  7 +---
>>   arch/powerpc/kernel/vdso64/datapage.S    |  3 --
>>   arch/powerpc/kernel/vdso64/vdso64.lds.S  |  7 +---
>>   6 files changed, 16 insertions(+), 65 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
>> index b9ef6cf50ea5..11886467dfdf 100644
>> --- a/arch/powerpc/include/asm/vdso_datapage.h
>> +++ b/arch/powerpc/include/asm/vdso_datapage.h
>> @@ -118,10 +118,12 @@ extern struct vdso_data *vdso_data;
>>   
>>   .macro get_datapage ptr, tmp
>>   	bcl	20, 31, .+4
>> +999:
>>   	mflr	\ptr
>> -	addi	\ptr, \ptr, (__kernel_datapage_offset - (.-4))@l
>> -	lwz	\tmp, 0(\ptr)
>> -	add	\ptr, \tmp, \ptr
>> +#if CONFIG_PPC_PAGE_SHIFT > 14
>> +	addis	\ptr, \ptr, (_vdso_datapage - 999b)@ha
>> +#endif
>> +	addi	\ptr, \ptr, (_vdso_datapage - 999b)@l
>>   .endm
>>   
>>   #endif /* __ASSEMBLY__ */
>> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
>> index f38f26e844b6..d33fa22ddbed 100644
>> --- a/arch/powerpc/kernel/vdso.c
>> +++ b/arch/powerpc/kernel/vdso.c
>> @@ -190,7 +190,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>>   	 * install_special_mapping or the perf counter mmap tracking code
>>   	 * will fail to recognise it as a vDSO (since arch_vma_name fails).
>>   	 */
>> -	current->mm->context.vdso_base = vdso_base;
>> +	current->mm->context.vdso_base = vdso_base + PAGE_SIZE;
> 
> I merged this but then realised it breaks the display of the vdso in /proc/self/maps.
> 
> ie. the vdso vma gets no name:
> 
>    # cat /proc/self/maps
>    110f90000-110fa0000 r-xp 00000000 08:03 17021844                         /usr/bin/cat
>    110fa0000-110fb0000 r--p 00000000 08:03 17021844                         /usr/bin/cat
>    110fb0000-110fc0000 rw-p 00010000 08:03 17021844                         /usr/bin/cat
>    126000000-126030000 rw-p 00000000 00:00 0                                [heap]
>    7fffa8790000-7fffa87d0000 rw-p 00000000 00:00 0
>    7fffa87d0000-7fffa8830000 r--p 00000000 08:03 17521786                   /usr/lib/locale/en_AU.utf8/LC_CTYPE
>    7fffa8830000-7fffa8840000 r--p 00000000 08:03 16958337                   /usr/lib/locale/en_AU.utf8/LC_NUMERIC
>    7fffa8840000-7fffa8850000 r--p 00000000 08:03 8501358                    /usr/lib/locale/en_AU.utf8/LC_TIME
>    7fffa8850000-7fffa8ad0000 r--p 00000000 08:03 16870886                   /usr/lib/locale/en_AU.utf8/LC_COLLATE
>    7fffa8ad0000-7fffa8ae0000 r--p 00000000 08:03 8509433                    /usr/lib/locale/en_AU.utf8/LC_MONETARY
>    7fffa8ae0000-7fffa8af0000 r--p 00000000 08:03 25383753                   /usr/lib/locale/en_AU.utf8/LC_MESSAGES/SYS_LC_MESSAGES
>    7fffa8af0000-7fffa8b00000 r--p 00000000 08:03 17521790                   /usr/lib/locale/en_AU.utf8/LC_PAPER
>    7fffa8b00000-7fffa8b10000 r--p 00000000 08:03 8501354                    /usr/lib/locale/en_AU.utf8/LC_NAME
>    7fffa8b10000-7fffa8b20000 r--p 00000000 08:03 8509431                    /usr/lib/locale/en_AU.utf8/LC_ADDRESS
>    7fffa8b20000-7fffa8b30000 r--p 00000000 08:03 8509434                    /usr/lib/locale/en_AU.utf8/LC_TELEPHONE
>    7fffa8b30000-7fffa8b40000 r--p 00000000 08:03 17521787                   /usr/lib/locale/en_AU.utf8/LC_MEASUREMENT
>    7fffa8b40000-7fffa8b50000 r--s 00000000 08:03 25623315                   /usr/lib64/gconv/gconv-modules.cache
>    7fffa8b50000-7fffa8d40000 r-xp 00000000 08:03 25383789                   /usr/lib64/libc-2.30.so
>    7fffa8d40000-7fffa8d50000 r--p 001e0000 08:03 25383789                   /usr/lib64/libc-2.30.so
>    7fffa8d50000-7fffa8d60000 rw-p 001f0000 08:03 25383789                   /usr/lib64/libc-2.30.so
>    7fffa8d60000-7fffa8d70000 r--p 00000000 08:03 8509432                    /usr/lib/locale/en_AU.utf8/LC_IDENTIFICATION
>    7fffa8d70000-7fffa8d90000 r-xp 00000000 00:00 0						<--- missing
>    7fffa8d90000-7fffa8dc0000 r-xp 00000000 08:03 25383781                   /usr/lib64/ld-2.30.so
>    7fffa8dc0000-7fffa8dd0000 r--p 00020000 08:03 25383781                   /usr/lib64/ld-2.30.so
>    7fffa8dd0000-7fffa8de0000 rw-p 00030000 08:03 25383781                   /usr/lib64/ld-2.30.so
>    7fffc31c0000-7fffc31f0000 rw-p 00000000 00:00 0                          [stack]
> 
> 
> And it's also going to break the logic in arch_unmap() to detect if
> we're unmapping (part of) the VDSO. And it will break arch_remap() too.
> 
> And the logic to recognise the signal trampoline in
> arch/powerpc/perf/callchain_*.c as well.

I don't think it breaks that one, because ->vdso_base is still the start 
of text.

> 
> So I'm going to rebase and drop this for now.
> 
> Basically we have a bunch of places that assume that vdso_base is == the
> start of the VDSO vma, and also that the code starts there. So that will
> need some work to tease out all those assumptions and make them work
> with this change.

Ok, one day I need to look at it in more details and see how other 
architectures handle it etc ...

As the benefit is only 2 CPU cycles, I'll drop it for now.

Christophe
Christophe Leroy Aug. 25, 2020, 2:15 p.m. UTC | #3
Le 04/08/2020 à 13:17, Christophe Leroy a écrit :
> 
> 
> On 07/16/2020 02:59 AM, Michael Ellerman wrote:
>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>>> The VDSO datapage and the text pages are always located immediately
>>> next to each other, so it can be hardcoded without an indirection
>>> through __kernel_datapage_offset
>>>
>>> In order to ease things, move the data page in front like other
>>> arches, that way there is no need to know the size of the library
>>> to locate the data page.
>>>
[...]

>>
>> I merged this but then realised it breaks the display of the vdso in 
>> /proc/self/maps.
>>
>> ie. the vdso vma gets no name:
>>
>>    # cat /proc/self/maps

[...]

>>
>>
>> And it's also going to break the logic in arch_unmap() to detect if
>> we're unmapping (part of) the VDSO. And it will break arch_remap() too.
>>
>> And the logic to recognise the signal trampoline in
>> arch/powerpc/perf/callchain_*.c as well.
> 
> I don't think it breaks that one, because ->vdso_base is still the start 
> of text.
> 
>>
>> So I'm going to rebase and drop this for now.
>>
>> Basically we have a bunch of places that assume that vdso_base is == the
>> start of the VDSO vma, and also that the code starts there. So that will
>> need some work to tease out all those assumptions and make them work
>> with this change.
> 
> Ok, one day I need to look at it in more details and see how other 
> architectures handle it etc ...
> 

I just sent out a series which switches powerpc to the new 
_install_special_mapping() API, the one powerpc uses being deprecated 
since commit a62c34bd2a8a ("x86, mm: Improve _install_special_mapping
and fix x86 vdso naming")

arch_remap() gets replaced by vdso_remap()

For arch_unmap(), I'm wondering how/what other architectures do, because 
powerpc seems to be the only one to erase the vdso context pointer when 
unmapping the vdso. So far I updated it to take into account the pages 
switch.

Everything else is not impacted because our vdso_base is still the base 
of the text and that's what those things (signal trampoline, callchain, 
...) expect.

Maybe we should change it to 'void *vdso' in the same way as other 
architectures, as it is not anymore the exact vdso_base but the start of 
VDSO text.

Note that the series applies on top of the generic C VDSO implementation 
series. However all but the last commit cleanly apply without that 
series. As that last commit is just a follow-up cleanup, it can come in 
a second step.

Christophe
Michael Ellerman Aug. 26, 2020, 1:58 p.m. UTC | #4
Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 04/08/2020 à 13:17, Christophe Leroy a écrit :
>> On 07/16/2020 02:59 AM, Michael Ellerman wrote:
>>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>>>> The VDSO datapage and the text pages are always located immediately
>>>> next to each other, so it can be hardcoded without an indirection
>>>> through __kernel_datapage_offset
>>>>
>>>> In order to ease things, move the data page in front like other
>>>> arches, that way there is no need to know the size of the library
>>>> to locate the data page.
>>>>
> [...]
>
>>>
>>> I merged this but then realised it breaks the display of the vdso in 
>>> /proc/self/maps.
>>>
>>> ie. the vdso vma gets no name:
>>>
>>>    # cat /proc/self/maps
>
> [...]
>
>>> And it's also going to break the logic in arch_unmap() to detect if
>>> we're unmapping (part of) the VDSO. And it will break arch_remap() too.
>>>
>>> And the logic to recognise the signal trampoline in
>>> arch/powerpc/perf/callchain_*.c as well.
>> 
>> I don't think it breaks that one, because ->vdso_base is still the start 
>> of text.
>> 
>>>
>>> So I'm going to rebase and drop this for now.
>>>
>>> Basically we have a bunch of places that assume that vdso_base is == the
>>> start of the VDSO vma, and also that the code starts there. So that will
>>> need some work to tease out all those assumptions and make them work
>>> with this change.
>> 
>> Ok, one day I need to look at it in more details and see how other 
>> architectures handle it etc ...
>> 
>
> I just sent out a series which switches powerpc to the new 
> _install_special_mapping() API, the one powerpc uses being deprecated 
> since commit a62c34bd2a8a ("x86, mm: Improve _install_special_mapping
> and fix x86 vdso naming")
>
> arch_remap() gets replaced by vdso_remap()
>
> For arch_unmap(), I'm wondering how/what other architectures do, because 
> powerpc seems to be the only one to erase the vdso context pointer when 
> unmapping the vdso.

Yeah. The original unmap/remap stuff was added for CRIU, which I thought
people tested on other architectures (more than powerpc even).

Possibly no one really cares about vdso unmap though, vs just moving the
vdso.

We added a test for vdso unmap recently because it happened to trigger a
KUAP failure, and someone actually hit it & reported it.

Running that test on arm64 segfaults:

  # ./sigreturn_vdso 
  VDSO is at 0xffff8191f000-0xffff8191ffff (4096 bytes)
  Signal delivered OK with VDSO mapped
  VDSO moved to 0xffff8191a000-0xffff8191afff (4096 bytes)
  Signal delivered OK with VDSO moved
  Unmapped VDSO
  Remapped the stack executable
  [   48.556191] potentially unexpected fatal signal 11.
  [   48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted 5.9.0-rc2-00057-g2ac69819ba9e #190
  [   48.556990] Hardware name: linux,dummy-virt (DT)
  [   48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--)
  [   48.557475] pc : 0000ffff8191a7bc
  [   48.557603] lr : 0000ffff8191a7bc
  [   48.557697] sp : 0000ffffc13c9e90
  [   48.557873] x29: 0000ffffc13cb0e0 x28: 0000000000000000 
  [   48.558201] x27: 0000000000000000 x26: 0000000000000000 
  [   48.558337] x25: 0000000000000000 x24: 0000000000000000 
  [   48.558754] x23: 0000000000000000 x22: 0000000000000000 
  [   48.558893] x21: 00000000004009b0 x20: 0000000000000000 
  [   48.559046] x19: 0000000000400ff0 x18: 0000000000000000 
  [   48.559180] x17: 0000ffff817da300 x16: 0000000000412010 
  [   48.559312] x15: 0000000000000000 x14: 000000000000001c 
  [   48.559443] x13: 656c626174756365 x12: 7865206b63617473 
  [   48.559625] x11: 0000000000000003 x10: 0101010101010101 
  [   48.559828] x9 : 0000ffff818afda8 x8 : 0000000000000081 
  [   48.559973] x7 : 6174732065687420 x6 : 64657070616d6552 
  [   48.560115] x5 : 000000000e0388bd x4 : 000000000040135d 
  [   48.560270] x3 : 0000000000000000 x2 : 0000000000000001 
  [   48.560412] x1 : 0000000000000003 x0 : 00000000004120b8 
  Segmentation fault
  #

So I think we need to keep the unmap hook. Maybe it should be handled by
the special_mapping stuff generically.

cheers
Dmitry Safonov Aug. 27, 2020, 8:34 p.m. UTC | #5
Hello,

On Wed, 26 Aug 2020 at 15:39, Michael Ellerman <mpe@ellerman.id.au> wrote:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
[..]
> > arch_remap() gets replaced by vdso_remap()
> >
> > For arch_unmap(), I'm wondering how/what other architectures do, because
> > powerpc seems to be the only one to erase the vdso context pointer when
> > unmapping the vdso.
>
> Yeah. The original unmap/remap stuff was added for CRIU, which I thought
> people tested on other architectures (more than powerpc even).
>
> Possibly no one really cares about vdso unmap though, vs just moving the
> vdso.
>
> We added a test for vdso unmap recently because it happened to trigger a
> KUAP failure, and someone actually hit it & reported it.

You right, CRIU cares much more about moving vDSO.
It's done for each restoree and as on most setups vDSO is premapped and
used by the application - it's actively tested.
Speaking about vDSO unmap - that's concerning only for heterogeneous C/R,
i.e when an application is migrated from a system that uses vDSO to the one
which doesn't - it's much rare scenario.
(for arm it's !CONFIG_VDSO, for x86 it's `vdso=0` boot parameter)

Looking at the code, it seems quite easy to provide/maintain .close() for
vm_special_mapping. A bit harder to add a test from CRIU side
(as glibc won't know on restore that it can't use vdso anymore),
but totally not impossible.

> Running that test on arm64 segfaults:
>
>   # ./sigreturn_vdso
>   VDSO is at 0xffff8191f000-0xffff8191ffff (4096 bytes)
>   Signal delivered OK with VDSO mapped
>   VDSO moved to 0xffff8191a000-0xffff8191afff (4096 bytes)
>   Signal delivered OK with VDSO moved
>   Unmapped VDSO
>   Remapped the stack executable
>   [   48.556191] potentially unexpected fatal signal 11.
>   [   48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted 5.9.0-rc2-00057-g2ac69819ba9e #190
>   [   48.556990] Hardware name: linux,dummy-virt (DT)
>   [   48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--)
>   [   48.557475] pc : 0000ffff8191a7bc
>   [   48.557603] lr : 0000ffff8191a7bc
>   [   48.557697] sp : 0000ffffc13c9e90
>   [   48.557873] x29: 0000ffffc13cb0e0 x28: 0000000000000000
>   [   48.558201] x27: 0000000000000000 x26: 0000000000000000
>   [   48.558337] x25: 0000000000000000 x24: 0000000000000000
>   [   48.558754] x23: 0000000000000000 x22: 0000000000000000
>   [   48.558893] x21: 00000000004009b0 x20: 0000000000000000
>   [   48.559046] x19: 0000000000400ff0 x18: 0000000000000000
>   [   48.559180] x17: 0000ffff817da300 x16: 0000000000412010
>   [   48.559312] x15: 0000000000000000 x14: 000000000000001c
>   [   48.559443] x13: 656c626174756365 x12: 7865206b63617473
>   [   48.559625] x11: 0000000000000003 x10: 0101010101010101
>   [   48.559828] x9 : 0000ffff818afda8 x8 : 0000000000000081
>   [   48.559973] x7 : 6174732065687420 x6 : 64657070616d6552
>   [   48.560115] x5 : 000000000e0388bd x4 : 000000000040135d
>   [   48.560270] x3 : 0000000000000000 x2 : 0000000000000001
>   [   48.560412] x1 : 0000000000000003 x0 : 00000000004120b8
>   Segmentation fault
>   #
>
> So I think we need to keep the unmap hook. Maybe it should be handled by
> the special_mapping stuff generically.

I'll cook a patch for vm_special_mapping if you don't mind :-)

Thanks,
             Dmitry
Michael Ellerman Aug. 28, 2020, 2:14 a.m. UTC | #6
Dmitry Safonov <0x7f454c46@gmail.com> writes:
> Hello,
>
> On Wed, 26 Aug 2020 at 15:39, Michael Ellerman <mpe@ellerman.id.au> wrote:
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> [..]
>> > arch_remap() gets replaced by vdso_remap()
>> >
>> > For arch_unmap(), I'm wondering how/what other architectures do, because
>> > powerpc seems to be the only one to erase the vdso context pointer when
>> > unmapping the vdso.
>>
>> Yeah. The original unmap/remap stuff was added for CRIU, which I thought
>> people tested on other architectures (more than powerpc even).
>>
>> Possibly no one really cares about vdso unmap though, vs just moving the
>> vdso.
>>
>> We added a test for vdso unmap recently because it happened to trigger a
>> KAUP failure, and someone actually hit it & reported it.
>
> You right, CRIU cares much more about moving vDSO.
> It's done for each restoree and as on most setups vDSO is premapped and
> used by the application - it's actively tested.
> Speaking about vDSO unmap - that's concerning only for heterogeneous C/R,
> i.e when an application is migrated from a system that uses vDSO to the one
> which doesn't - it's much rare scenario.
> (for arm it's !CONFIG_VDSO, for x86 it's `vdso=0` boot parameter)

Ah OK that explains it.

The case we hit of VDSO unmapping was some strange "library OS" thing
which had explicitly unmapped the VDSO, so also very rare.

> Looking at the code, it seems quite easy to provide/maintain .close() for
> vm_special_mapping. A bit harder to add a test from CRIU side
> (as glibc won't know on restore that it can't use vdso anymore),
> but totally not impossible.
>
>> Running that test on arm64 segfaults:
>>
>>   # ./sigreturn_vdso
>>   VDSO is at 0xffff8191f000-0xffff8191ffff (4096 bytes)
>>   Signal delivered OK with VDSO mapped
>>   VDSO moved to 0xffff8191a000-0xffff8191afff (4096 bytes)
>>   Signal delivered OK with VDSO moved
>>   Unmapped VDSO
>>   Remapped the stack executable
>>   [   48.556191] potentially unexpected fatal signal 11.
>>   [   48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted 5.9.0-rc2-00057-g2ac69819ba9e #190
>>   [   48.556990] Hardware name: linux,dummy-virt (DT)
>>   [   48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--)
>>   [   48.557475] pc : 0000ffff8191a7bc
>>   [   48.557603] lr : 0000ffff8191a7bc
>>   [   48.557697] sp : 0000ffffc13c9e90
>>   [   48.557873] x29: 0000ffffc13cb0e0 x28: 0000000000000000
>>   [   48.558201] x27: 0000000000000000 x26: 0000000000000000
>>   [   48.558337] x25: 0000000000000000 x24: 0000000000000000
>>   [   48.558754] x23: 0000000000000000 x22: 0000000000000000
>>   [   48.558893] x21: 00000000004009b0 x20: 0000000000000000
>>   [   48.559046] x19: 0000000000400ff0 x18: 0000000000000000
>>   [   48.559180] x17: 0000ffff817da300 x16: 0000000000412010
>>   [   48.559312] x15: 0000000000000000 x14: 000000000000001c
>>   [   48.559443] x13: 656c626174756365 x12: 7865206b63617473
>>   [   48.559625] x11: 0000000000000003 x10: 0101010101010101
>>   [   48.559828] x9 : 0000ffff818afda8 x8 : 0000000000000081
>>   [   48.559973] x7 : 6174732065687420 x6 : 64657070616d6552
>>   [   48.560115] x5 : 000000000e0388bd x4 : 000000000040135d
>>   [   48.560270] x3 : 0000000000000000 x2 : 0000000000000001
>>   [   48.560412] x1 : 0000000000000003 x0 : 00000000004120b8
>>   Segmentation fault
>>   #
>>
>> So I think we need to keep the unmap hook. Maybe it should be handled by
>> the special_mapping stuff generically.
>
> I'll cook a patch for vm_special_mapping if you don't mind :-)

That would be great, thanks!

cheers
Will Deacon Sept. 21, 2020, 11:26 a.m. UTC | #7
On Fri, Aug 28, 2020 at 12:14:28PM +1000, Michael Ellerman wrote:
> Dmitry Safonov <0x7f454c46@gmail.com> writes:
> > On Wed, 26 Aug 2020 at 15:39, Michael Ellerman <mpe@ellerman.id.au> wrote:
> >> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> >> We added a test for vdso unmap recently because it happened to trigger a
> >> KUAP failure, and someone actually hit it & reported it.
> >
> > You right, CRIU cares much more about moving vDSO.
> > It's done for each restoree and as on most setups vDSO is premapped and
> > used by the application - it's actively tested.
> > Speaking about vDSO unmap - that's concerning only for heterogeneous C/R,
> > i.e when an application is migrated from a system that uses vDSO to the one
> > which doesn't - it's much rare scenario.
> > (for arm it's !CONFIG_VDSO, for x86 it's `vdso=0` boot parameter)
> 
> Ah OK that explains it.
> 
> The case we hit of VDSO unmapping was some strange "library OS" thing
> which had explicitly unmapped the VDSO, so also very rare.
> 
> > Looking at the code, it seems quite easy to provide/maintain .close() for
> > vm_special_mapping. A bit harder to add a test from CRIU side
> > (as glibc won't know on restore that it can't use vdso anymore),
> > but totally not impossible.
> >
> >> Running that test on arm64 segfaults:
> >>
> >>   # ./sigreturn_vdso
> >>   VDSO is at 0xffff8191f000-0xffff8191ffff (4096 bytes)
> >>   Signal delivered OK with VDSO mapped
> >>   VDSO moved to 0xffff8191a000-0xffff8191afff (4096 bytes)
> >>   Signal delivered OK with VDSO moved
> >>   Unmapped VDSO
> >>   Remapped the stack executable
> >>   [   48.556191] potentially unexpected fatal signal 11.
> >>   [   48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted 5.9.0-rc2-00057-g2ac69819ba9e #190
> >>   [   48.556990] Hardware name: linux,dummy-virt (DT)
> >>   [   48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--)
> >>   [   48.557475] pc : 0000ffff8191a7bc
> >>   [   48.557603] lr : 0000ffff8191a7bc
> >>   [   48.557697] sp : 0000ffffc13c9e90
> >>   [   48.557873] x29: 0000ffffc13cb0e0 x28: 0000000000000000
> >>   [   48.558201] x27: 0000000000000000 x26: 0000000000000000
> >>   [   48.558337] x25: 0000000000000000 x24: 0000000000000000
> >>   [   48.558754] x23: 0000000000000000 x22: 0000000000000000
> >>   [   48.558893] x21: 00000000004009b0 x20: 0000000000000000
> >>   [   48.559046] x19: 0000000000400ff0 x18: 0000000000000000
> >>   [   48.559180] x17: 0000ffff817da300 x16: 0000000000412010
> >>   [   48.559312] x15: 0000000000000000 x14: 000000000000001c
> >>   [   48.559443] x13: 656c626174756365 x12: 7865206b63617473
> >>   [   48.559625] x11: 0000000000000003 x10: 0101010101010101
> >>   [   48.559828] x9 : 0000ffff818afda8 x8 : 0000000000000081
> >>   [   48.559973] x7 : 6174732065687420 x6 : 64657070616d6552
> >>   [   48.560115] x5 : 000000000e0388bd x4 : 000000000040135d
> >>   [   48.560270] x3 : 0000000000000000 x2 : 0000000000000001
> >>   [   48.560412] x1 : 0000000000000003 x0 : 00000000004120b8
> >>   Segmentation fault
> >>   #
> >>
> >> So I think we need to keep the unmap hook. Maybe it should be handled by
> >> the special_mapping stuff generically.
> >
> > I'll cook a patch for vm_special_mapping if you don't mind :-)
> 
> That would be great, thanks!

I lost track of this one. Is there a patch kicking around to resolve this,
or is the segfault expected behaviour?

Will
Christophe Leroy Sept. 27, 2020, 7:43 a.m. UTC | #8
Le 21/09/2020 à 13:26, Will Deacon a écrit :
> On Fri, Aug 28, 2020 at 12:14:28PM +1000, Michael Ellerman wrote:
>> Dmitry Safonov <0x7f454c46@gmail.com> writes:
>>> On Wed, 26 Aug 2020 at 15:39, Michael Ellerman <mpe@ellerman.id.au> wrote:
>>>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>>> We added a test for vdso unmap recently because it happened to trigger a
>>>> KUAP failure, and someone actually hit it & reported it.
>>>
>>> You right, CRIU cares much more about moving vDSO.
>>> It's done for each restoree and as on most setups vDSO is premapped and
>>> used by the application - it's actively tested.
>>> Speaking about vDSO unmap - that's concerning only for heterogeneous C/R,
>>> i.e when an application is migrated from a system that uses vDSO to the one
>>> which doesn't - it's much rare scenario.
>>> (for arm it's !CONFIG_VDSO, for x86 it's `vdso=0` boot parameter)
>>
>> Ah OK that explains it.
>>
>> The case we hit of VDSO unmapping was some strange "library OS" thing
>> which had explicitly unmapped the VDSO, so also very rare.
>>
>>> Looking at the code, it seems quite easy to provide/maintain .close() for
>>> vm_special_mapping. A bit harder to add a test from CRIU side
>>> (as glibc won't know on restore that it can't use vdso anymore),
>>> but totally not impossible.
>>>
>>>> Running that test on arm64 segfaults:
>>>>
>>>>    # ./sigreturn_vdso
>>>>    VDSO is at 0xffff8191f000-0xffff8191ffff (4096 bytes)
>>>>    Signal delivered OK with VDSO mapped
>>>>    VDSO moved to 0xffff8191a000-0xffff8191afff (4096 bytes)
>>>>    Signal delivered OK with VDSO moved
>>>>    Unmapped VDSO
>>>>    Remapped the stack executable
>>>>    [   48.556191] potentially unexpected fatal signal 11.
>>>>    [   48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted 5.9.0-rc2-00057-g2ac69819ba9e #190
>>>>    [   48.556990] Hardware name: linux,dummy-virt (DT)
>>>>    [   48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--)
>>>>    [   48.557475] pc : 0000ffff8191a7bc
>>>>    [   48.557603] lr : 0000ffff8191a7bc
>>>>    [   48.557697] sp : 0000ffffc13c9e90
>>>>    [   48.557873] x29: 0000ffffc13cb0e0 x28: 0000000000000000
>>>>    [   48.558201] x27: 0000000000000000 x26: 0000000000000000
>>>>    [   48.558337] x25: 0000000000000000 x24: 0000000000000000
>>>>    [   48.558754] x23: 0000000000000000 x22: 0000000000000000
>>>>    [   48.558893] x21: 00000000004009b0 x20: 0000000000000000
>>>>    [   48.559046] x19: 0000000000400ff0 x18: 0000000000000000
>>>>    [   48.559180] x17: 0000ffff817da300 x16: 0000000000412010
>>>>    [   48.559312] x15: 0000000000000000 x14: 000000000000001c
>>>>    [   48.559443] x13: 656c626174756365 x12: 7865206b63617473
>>>>    [   48.559625] x11: 0000000000000003 x10: 0101010101010101
>>>>    [   48.559828] x9 : 0000ffff818afda8 x8 : 0000000000000081
>>>>    [   48.559973] x7 : 6174732065687420 x6 : 64657070616d6552
>>>>    [   48.560115] x5 : 000000000e0388bd x4 : 000000000040135d
>>>>    [   48.560270] x3 : 0000000000000000 x2 : 0000000000000001
>>>>    [   48.560412] x1 : 0000000000000003 x0 : 00000000004120b8
>>>>    Segmentation fault
>>>>    #
>>>>
>>>> So I think we need to keep the unmap hook. Maybe it should be handled by
>>>> the special_mapping stuff generically.
>>>
>>> I'll cook a patch for vm_special_mapping if you don't mind :-)
>>
>> That would be great, thanks!
> 
> I lost track of this one. Is there a patch kicking around to resolve this,
> or is the segfault expected behaviour?
> 

IIUC, Dmitry said he would cook a patch. I have not seen any patch yet.

AFAICS, among the architectures having VDSO sigreturn trampolines, only SH, X86 and POWERPC provide an 
alternative trampoline on the stack when the VDSO is not there.

All other architectures just having a VDSO don't expect VDSO to not be mapped.

Since nowadays stacks are mapped non-executable, getting a segfault is expected behaviour. 
However, I think we should really make it cleaner. Today it segfaults because it is still pointing 
to the VDSO trampoline that has been unmapped. But should the user map some other code at the same 
address, we'll run into the weeds on signal return instead of segfaulting.

So VDSO unmapping should really be properly managed, the reference should be properly cleared in 
order to segfault in a controllable manner.

Only powerpc has a hook to properly clear the VDSO pointer when VDSO is unmapped.

Christophe
Dmitry Safonov Sept. 28, 2020, 3:08 p.m. UTC | #9
On 9/27/20 8:43 AM, Christophe Leroy wrote:
> 
> 
> Le 21/09/2020 à 13:26, Will Deacon a écrit :
>> On Fri, Aug 28, 2020 at 12:14:28PM +1000, Michael Ellerman wrote:
>>> Dmitry Safonov <0x7f454c46@gmail.com> writes:
[..]
>>>> I'll cook a patch for vm_special_mapping if you don't mind :-)
>>>
>>> That would be great, thanks!
>>
>> I lost track of this one. Is there a patch kicking around to resolve
>> this,
>> or is the segfault expected behaviour?
>>
> 
> IIUC dmitry said he will cook a patch. I have not seen any patch yet.

Yes, sorry about the delay - I was a bit busy with xfrm patches.

I'll send patches for .close() this week, working on them now.

> AFAIKS, among the architectures having VDSO sigreturn trampolines, only
> SH, X86 and POWERPC provide alternative trampoline on stack when VDSO is
> not there.
> 
> All other architectures just having a VDSO don't expect VDSO to not be
> mapped.
> 
> As far as nowadays stacks are mapped non-executable, getting a segfaut
> is expected behaviour. However, I think we should really make it
> cleaner. Today it segfaults because it is still pointing to the VDSO
> trampoline that has been unmapped. But should the user map some other
> code at the same address, we'll run in the weed on signal return instead
> of segfaulting.

+1.

> So VDSO unmapping should really be properly managed, the reference
> should be properly cleared in order to segfault in a controllable manner.
> 
> Only powerpc has a hook to properly clear the VDSO pointer when VDSO is
> unmapped.

Thanks,
         Dmitry
Christophe Leroy Oct. 23, 2020, 11:22 a.m. UTC | #10
Hi Dmitry,

Le 28/09/2020 à 17:08, Dmitry Safonov a écrit :
> On 9/27/20 8:43 AM, Christophe Leroy wrote:
>>
>>
>> Le 21/09/2020 à 13:26, Will Deacon a écrit :
>>> On Fri, Aug 28, 2020 at 12:14:28PM +1000, Michael Ellerman wrote:
>>>> Dmitry Safonov <0x7f454c46@gmail.com> writes:
> [..]
>>>>> I'll cook a patch for vm_special_mapping if you don't mind :-)
>>>>
>>>> That would be great, thanks!
>>>
>>> I lost track of this one. Is there a patch kicking around to resolve
>>> this,
>>> or is the segfault expected behaviour?
>>>
>>
>> IIUC dmitry said he will cook a patch. I have not seen any patch yet.
> 
> Yes, sorry about the delay - I was a bit busy with xfrm patches.
> 
> I'll send patches for .close() this week, working on them now.

I haven't seen the patches, did you sent them out finally ?

Christophe
Will Deacon Oct. 23, 2020, 11:25 a.m. UTC | #11
On Fri, Oct 23, 2020 at 01:22:04PM +0200, Christophe Leroy wrote:
> Hi Dmitry,
> 
> Le 28/09/2020 à 17:08, Dmitry Safonov a écrit :
> > On 9/27/20 8:43 AM, Christophe Leroy wrote:
> > > 
> > > 
> > > Le 21/09/2020 à 13:26, Will Deacon a écrit :
> > > > On Fri, Aug 28, 2020 at 12:14:28PM +1000, Michael Ellerman wrote:
> > > > > Dmitry Safonov <0x7f454c46@gmail.com> writes:
> > [..]
> > > > > > I'll cook a patch for vm_special_mapping if you don't mind :-)
> > > > > 
> > > > > That would be great, thanks!
> > > > 
> > > > I lost track of this one. Is there a patch kicking around to resolve
> > > > this,
> > > > or is the segfault expected behaviour?
> > > > 
> > > 
> > > IIUC dmitry said he will cook a patch. I have not seen any patch yet.
> > 
> > Yes, sorry about the delay - I was a bit busy with xfrm patches.
> > 
> > I'll send patches for .close() this week, working on them now.
> 
> I haven't seen the patches, did you sent them out finally ?

I think it's this series:

https://lore.kernel.org/r/20201013013416.390574-1-dima@arista.com

but they look really invasive to me, so I may cook a small hack for arm64
in the meantine / for stable.

Will
Christophe Leroy Oct. 23, 2020, 11:57 a.m. UTC | #12
Le 23/10/2020 à 13:25, Will Deacon a écrit :
> On Fri, Oct 23, 2020 at 01:22:04PM +0200, Christophe Leroy wrote:
>> Hi Dmitry,
>>
>> Le 28/09/2020 à 17:08, Dmitry Safonov a écrit :
>>> On 9/27/20 8:43 AM, Christophe Leroy wrote:
>>>>
>>>>
>>>> Le 21/09/2020 à 13:26, Will Deacon a écrit :
>>>>> On Fri, Aug 28, 2020 at 12:14:28PM +1000, Michael Ellerman wrote:
>>>>>> Dmitry Safonov <0x7f454c46@gmail.com> writes:
>>> [..]
>>>>>>> I'll cook a patch for vm_special_mapping if you don't mind :-)
>>>>>>
>>>>>> That would be great, thanks!
>>>>>
>>>>> I lost track of this one. Is there a patch kicking around to resolve
>>>>> this,
>>>>> or is the segfault expected behaviour?
>>>>>
>>>>
>>>> IIUC dmitry said he will cook a patch. I have not seen any patch yet.
>>>
>>> Yes, sorry about the delay - I was a bit busy with xfrm patches.
>>>
>>> I'll send patches for .close() this week, working on them now.
>>
>> I haven't seen the patches, did you sent them out finally ?
> 
> I think it's this series:
> 
> https://lore.kernel.org/r/20201013013416.390574-1-dima@arista.com
> 
> but they look really invasive to me, so I may cook a small hack for arm64
> in the meantine / for stable.
> 

Not sure we are talking about the same thing.

I can't see any new .close function added to vm_special_mapping in order to replace arch_unmap() hook.

Christophe
Dmitry Safonov Oct. 23, 2020, 1:29 p.m. UTC | #13
Hi Christophe, Will,

On 10/23/20 12:57 PM, Christophe Leroy wrote:
> 
> 
> Le 23/10/2020 à 13:25, Will Deacon a écrit :
>> On Fri, Oct 23, 2020 at 01:22:04PM +0200, Christophe Leroy wrote:
>>> Hi Dmitry,
[..]
>>> I haven't seen the patches, did you sent them out finally ?

I was working on .close() hook, but while cooking it, I thought it may
be better to make tracking of user landing generic. Note that the vdso
base address is mostly needed by kernel as an address to land in
userspace after processing a signal.

I have some raw patches that add
+#ifdef CONFIG_ARCH_HAS_USER_LANDING
+               struct vm_area_struct *user_landing;
+#endif
inside mm_struct and I plan to finish them after rc1 gets released.

While working on that, I noticed that arm32 and some other architectures
track the vdso position in mm.context for the sole purpose of adding
AT_SYSINFO_EHDR to the ELF header that's being loaded. Having a pointer
in mm.context that could instead be a local variable in the ELF binfmt
loader is quite overkill. Also, I found some issues in the mremap
code. The patch series mentioned is at the base of the branch with
generic user landing. I have sent only those patches, not the full branch,
as I remember there was a policy that during the merge window one should
send only fixes, rather than refactoring/new code.

>> I think it's this series:
>>
>> https://lore.kernel.org/r/20201013013416.390574-1-dima@arista.com
>>
>> but they look really invasive to me, so I may cook a small hack for arm64
>> in the meantine / for stable.

I don't mind small hacks, but I'm concerned that the suggested fix which
sets `mm->context.vdso_base = 0` on munmap() has its own issue: it
won't work if a user, for whatever broken reason, mremap()s the vdso to
address 0. As the fix is supposed to address an issue that hasn't fired for
anyone yet, it probably shouldn't introduce another. That's why I've
used a vm_area_struct to track the vdso position in the patch set.
Probably, temporary, you could use something like:
#define BAD_VDSO_ADDRESS    (-1UL)
Or non-page-aligned address.
But the signal code that checks whether it can land on the vdso/sigpage should
also be aligned with the new definition.

> Not sure we are talking about the same thing.
> 
> I can't see any new .close function added to vm_special_mapping in order
> to replace arch_unmap() hook.
Thanks,
          Dmitry
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index b9ef6cf50ea5..11886467dfdf 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -118,10 +118,12 @@  extern struct vdso_data *vdso_data;
 
 .macro get_datapage ptr, tmp
 	bcl	20, 31, .+4
+999:
 	mflr	\ptr
-	addi	\ptr, \ptr, (__kernel_datapage_offset - (.-4))@l
-	lwz	\tmp, 0(\ptr)
-	add	\ptr, \tmp, \ptr
+#if CONFIG_PPC_PAGE_SHIFT > 14
+	addis	\ptr, \ptr, (_vdso_datapage - 999b)@ha
+#endif
+	addi	\ptr, \ptr, (_vdso_datapage - 999b)@l
 .endm
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index f38f26e844b6..d33fa22ddbed 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -190,7 +190,7 @@  int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	 * install_special_mapping or the perf counter mmap tracking code
 	 * will fail to recognise it as a vDSO (since arch_vma_name fails).
 	 */
-	current->mm->context.vdso_base = vdso_base;
+	current->mm->context.vdso_base = vdso_base + PAGE_SIZE;
 
 	/*
 	 * our vma flags don't have VM_WRITE so by default, the process isn't
@@ -482,42 +482,6 @@  static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32,
 	vdso32_rt_sigtramp = find_function32(v32, "__kernel_sigtramp_rt32");
 }
 
-static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
-				       struct lib64_elfinfo *v64)
-{
-#ifdef CONFIG_VDSO32
-	Elf32_Sym *sym32;
-#endif
-#ifdef CONFIG_PPC64
-	Elf64_Sym *sym64;
-
-       	sym64 = find_symbol64(v64, "__kernel_datapage_offset");
-	if (sym64 == NULL) {
-		printk(KERN_ERR "vDSO64: Can't find symbol "
-		       "__kernel_datapage_offset !\n");
-		return -1;
-	}
-	*((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) =
-		(vdso64_pages << PAGE_SHIFT) -
-		(sym64->st_value - VDSO64_LBASE);
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_VDSO32
-	sym32 = find_symbol32(v32, "__kernel_datapage_offset");
-	if (sym32 == NULL) {
-		printk(KERN_ERR "vDSO32: Can't find symbol "
-		       "__kernel_datapage_offset !\n");
-		return -1;
-	}
-	*((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) =
-		(vdso32_pages << PAGE_SHIFT) -
-		(sym32->st_value - VDSO32_LBASE);
-#endif
-
-	return 0;
-}
-
-
 static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
 				      struct lib64_elfinfo *v64)
 {
@@ -618,9 +582,6 @@  static __init int vdso_setup(void)
 	if (vdso_do_find_sections(&v32, &v64))
 		return -1;
 
-	if (vdso_fixup_datapage(&v32, &v64))
-		return -1;
-
 	if (vdso_fixup_features(&v32, &v64))
 		return -1;
 
@@ -761,26 +722,26 @@  static int __init vdso_init(void)
 	vdso32_pagelist = kcalloc(vdso32_pages + 2, sizeof(struct page *),
 				  GFP_KERNEL);
 	BUG_ON(vdso32_pagelist == NULL);
+	vdso32_pagelist[0] = virt_to_page(vdso_data);
 	for (i = 0; i < vdso32_pages; i++) {
 		struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
 		get_page(pg);
-		vdso32_pagelist[i] = pg;
+		vdso32_pagelist[i + 1] = pg;
 	}
-	vdso32_pagelist[i++] = virt_to_page(vdso_data);
-	vdso32_pagelist[i] = NULL;
+	vdso32_pagelist[i + 1] = NULL;
 #endif
 
 #ifdef CONFIG_PPC64
 	vdso64_pagelist = kcalloc(vdso64_pages + 2, sizeof(struct page *),
 				  GFP_KERNEL);
 	BUG_ON(vdso64_pagelist == NULL);
+	vdso64_pagelist[0] = virt_to_page(vdso_data);
 	for (i = 0; i < vdso64_pages; i++) {
 		struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
 		get_page(pg);
-		vdso64_pagelist[i] = pg;
+		vdso64_pagelist[i + 1] = pg;
 	}
-	vdso64_pagelist[i++] = virt_to_page(vdso_data);
-	vdso64_pagelist[i] = NULL;
+	vdso64_pagelist[i + 1] = NULL;
 #endif /* CONFIG_PPC64 */
 
 	get_page(virt_to_page(vdso_data));
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 217bb630f8f9..5513a4f8253e 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -13,9 +13,6 @@ 
 #include <asm/vdso_datapage.h>
 
 	.text
-	.global	__kernel_datapage_offset;
-__kernel_datapage_offset:
-	.long	0
 
 /*
  * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index 5206c2eb2a1d..6cf729612268 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -4,6 +4,7 @@ 
  * library
  */
 #include <asm/vdso.h>
+#include <asm/page.h>
 
 #ifdef __LITTLE_ENDIAN__
 OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle")
@@ -15,6 +16,7 @@  ENTRY(_start)
 
 SECTIONS
 {
+	PROVIDE(_vdso_datapage = . - PAGE_SIZE);
 	. = VDSO32_LBASE + SIZEOF_HEADERS;
 
 	.hash          	: { *(.hash) }			:text
@@ -138,11 +140,6 @@  VERSION
 {
 	VDSO_VERSION_STRING {
 	global:
-		/*
-		 * Has to be there for the kernel to find
-		 */
-		__kernel_datapage_offset;
-
 		__kernel_get_syscall_map;
 #ifndef CONFIG_PPC_BOOK3S_601
 		__kernel_gettimeofday;
diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
index 067247d3efb9..03bb72c440dc 100644
--- a/arch/powerpc/kernel/vdso64/datapage.S
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -13,9 +13,6 @@ 
 #include <asm/vdso_datapage.h>
 
 	.text
-.global	__kernel_datapage_offset;
-__kernel_datapage_offset:
-	.long	0
 
 /*
  * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S
index 256fb9720298..f58c7e2e9cbd 100644
--- a/arch/powerpc/kernel/vdso64/vdso64.lds.S
+++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S
@@ -4,6 +4,7 @@ 
  * library
  */
 #include <asm/vdso.h>
+#include <asm/page.h>
 
 #ifdef __LITTLE_ENDIAN__
 OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle")
@@ -15,6 +16,7 @@  ENTRY(_start)
 
 SECTIONS
 {
+	PROVIDE(_vdso_datapage = . - PAGE_SIZE);
 	. = VDSO64_LBASE + SIZEOF_HEADERS;
 
 	.hash		: { *(.hash) }			:text
@@ -138,11 +140,6 @@  VERSION
 {
 	VDSO_VERSION_STRING {
 	global:
-		/*
-		 * Has to be there for the kernel to find
-		 */
-		__kernel_datapage_offset;
-
 		__kernel_get_syscall_map;
 		__kernel_gettimeofday;
 		__kernel_clock_gettime;