diff mbox series

[v5] um: Enable preemption in UML

Message ID 20230922065212.484385-1-anton.ivanov@cambridgegreys.com
State Superseded
Headers show
Series [v5] um: Enable preemption in UML | expand

Commit Message

Anton Ivanov Sept. 22, 2023, 6:52 a.m. UTC
From: Anton Ivanov <anton.ivanov@cambridgegreys.com>

Preemption requires saving/restoring FPU state. This patch
adds support for it using GCC intrinsics.

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
---
 arch/um/Kconfig                         |  1 -
 arch/um/include/asm/fpu/api.h           |  9 ++-
 arch/um/include/asm/processor-generic.h |  3 +
 arch/um/kernel/Makefile                 |  4 ++
 arch/um/kernel/fpu.c                    | 77 +++++++++++++++++++++++++
 arch/um/kernel/irq.c                    |  2 +
 arch/um/kernel/tlb.c                    |  9 +++
 7 files changed, 101 insertions(+), 4 deletions(-)
 create mode 100644 arch/um/kernel/fpu.c

Comments

Richard Weinberger Sept. 22, 2023, 7:27 a.m. UTC | #1
----- Ursprüngliche Mail -----
> Von: "anton ivanov" <anton.ivanov@cambridgegreys.com>
> An: "linux-um" <linux-um@lists.infradead.org>
> CC: "Johannes Berg" <johannes@sipsolutions.net>, "richard" <richard@nod.at>, "anton ivanov"
> <anton.ivanov@cambridgegreys.com>
> Gesendet: Freitag, 22. September 2023 08:52:12
> Betreff: [PATCH v5] um: Enable preemption in UML

> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> Preemption requires saving/restoring FPU state. This patch
> adds support for it using GCC intrinsics.

Can you please state in future patches what you have changed since the last one?
 
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
> index 7d050ab0f78a..34ec8e677fb9 100644
> --- a/arch/um/kernel/tlb.c
> +++ b/arch/um/kernel/tlb.c
> @@ -362,6 +362,9 @@ static int flush_tlb_kernel_range_common(unsigned long
> start, unsigned long end)
> 
> 	mm = &init_mm;
> 	hvc = INIT_HVC(mm, force, userspace);
> +
> +    preempt_disable();
> +

Does this address the bootup splat I've reported yesterday?


Thanks,
//richard
Anton Ivanov Sept. 22, 2023, 7:29 a.m. UTC | #2
On 22/09/2023 08:27, Richard Weinberger wrote:
> ----- Ursprüngliche Mail -----
>> Von: "anton ivanov" <anton.ivanov@cambridgegreys.com>
>> An: "linux-um" <linux-um@lists.infradead.org>
>> CC: "Johannes Berg" <johannes@sipsolutions.net>, "richard" <richard@nod.at>, "anton ivanov"
>> <anton.ivanov@cambridgegreys.com>
>> Gesendet: Freitag, 22. September 2023 08:52:12
>> Betreff: [PATCH v5] um: Enable preemption in UML
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> Preemption requires saving/restoring FPU state. This patch
>> adds support for it using GCC intrinsics.
> Can you please state in future patches what you have changed since the last one?

Ack. Sorry.

This one adds guards around the critical sections in the TLB.

As a result I no longer see any barfs on startup in the configuration where I was observing them (nfs kernel server).

Both preempt and voluntary now boot without issues in that config.

>   
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>> diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
>> index 7d050ab0f78a..34ec8e677fb9 100644
>> --- a/arch/um/kernel/tlb.c
>> +++ b/arch/um/kernel/tlb.c
>> @@ -362,6 +362,9 @@ static int flush_tlb_kernel_range_common(unsigned long
>> start, unsigned long end)
>>
>> 	mm = &init_mm;
>> 	hvc = INIT_HVC(mm, force, userspace);
>> +
>> +    preempt_disable();
>> +
> Does this address the bootup splat I've reported yesterday?
>
>
> Thanks,
> //richard
>
Johannes Berg Sept. 22, 2023, 7:30 a.m. UTC | #3
On Fri, 2023-09-22 at 07:52 +0100, anton.ivanov@cambridgegreys.com
wrote:
> 
> +++ b/arch/um/include/asm/processor-generic.h
> @@ -44,6 +44,9 @@ struct thread_struct {
>  			} cb;
>  		} u;
>  	} request;
> +#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
> +    u8 fpu[2048] __aligned(64); /* Intel docs require xsave/xrestore area to be aligned to 64 bytes */
> +#endif

Looks like you used spaces instead of tabs in a few places such as here.

> +#ifdef CONFIG_64BIT
> +	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
> +		__builtin_ia32_xsaveopt64(&current->thread.fpu, KNOWN_387_FEATURES);
> +	else {
> +		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
> +			__builtin_ia32_xsave64(&current->thread.fpu, KNOWN_387_FEATURES);
> +		else
> +			__builtin_ia32_fxsave64(&current->thread.fpu);
> +	}

Still think the else if chains would look better, but it also doesn't
matter much.

>  	mm = &init_mm;
>  	hvc = INIT_HVC(mm, force, userspace);
> +
> +    preempt_disable();


Also here spaces instead of tabs. Interesting you display tabs as 4
spaces when the kernel really does everything with tabs being 8 spaces
wide :)

But anyway that's all nitpicking, the real problem I found when running
this now was this:

BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1519
in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 282, name: startup.sh
preempt_count: 2, expected: 0
no locks held by startup.sh/282.
irq event stamp: 0
hardirqs last  enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<0000000060044b82>] copy_process+0xa02/0x244e
softirqs last  enabled at (0): [<0000000060044b82>] copy_process+0xa02/0x244e
softirqs last disabled at (0): [<0000000000000000>] 0x0
CPU: 0 PID: 282 Comm: startup.sh Not tainted 6.6.0-rc1 #147
Stack:
 7229be60 60500273 00000002 6003cfc9
 606bd782 00000000 60b3e968 00000000
 7229bea0 60526312 00000081 00000000
Call Trace:
 [<6051cbaa>] ? _printk+0x0/0x94
 [<6002a5b4>] show_stack+0x13d/0x14c
 [<60500273>] ? dump_stack_print_info+0xde/0xed
 [<6003cfc9>] ? um_set_signals+0x0/0x3f
 [<60526312>] dump_stack_lvl+0x62/0x96
 [<6051cbaa>] ? _printk+0x0/0x94
 [<6052729b>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<60526360>] dump_stack+0x1a/0x1c
 [<60073561>] __might_resched+0x2bb/0x2d9
 [<60073640>] __might_sleep+0xc1/0xcb
 [<6052bad8>] down_read+0x32/0x1c3
 [<6002c94e>] force_flush_all+0x74/0x105
 [<6002926e>] fork_handler+0x14/0x96


I had enabled CONFIG_DEBUG_ATOMIC_SLEEP because that's actually
something I'd really like to have in our testing.

But with that issue I don't even know how we get there really. It
doesn't even happen every time we fork?

I'll dig a little bit, but did you try enabling
CONFIG_DEBUG_ATOMIC_SLEEP also?

johannes
Anton Ivanov Sept. 22, 2023, 7:38 a.m. UTC | #4
On 22/09/2023 08:30, Johannes Berg wrote:
> On Fri, 2023-09-22 at 07:52 +0100, anton.ivanov@cambridgegreys.com
> wrote:
>> +++ b/arch/um/include/asm/processor-generic.h
>> @@ -44,6 +44,9 @@ struct thread_struct {
>>   			} cb;
>>   		} u;
>>   	} request;
>> +#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
>> +    u8 fpu[2048] __aligned(64); /* Intel docs require xsave/xrestore area to be aligned to 64 bytes */
>> +#endif
> Looks like you used spaces instead of tabs in a few places such as here.

Ack. I am not sure how they got there.

My environment is configured to use tabs when working on the kernel tree.

>
>> +#ifdef CONFIG_64BIT
>> +	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
>> +		__builtin_ia32_xsaveopt64(&current->thread.fpu, KNOWN_387_FEATURES);
>> +	else {
>> +		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
>> +			__builtin_ia32_xsave64(&current->thread.fpu, KNOWN_387_FEATURES);
>> +		else
>> +			__builtin_ia32_fxsave64(&current->thread.fpu);
>> +	}
> Still think the else if chains would look better, but it also doesn't
> matter much.
>
>>   	mm = &init_mm;
>>   	hvc = INIT_HVC(mm, force, userspace);
>> +
>> +    preempt_disable();
>
> Also here spaces instead of tabs. Interesting you display tabs as 4
> spaces when the kernel really does everything with tabs being 8 spaces
> wide :)
>
> But anyway that's all nitpicking, the real problem I found when running
> this now was this:
>
> BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1519
> in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 282, name: startup.sh
> preempt_count: 2, expected: 0
> no locks held by startup.sh/282.
> irq event stamp: 0
> hardirqs last  enabled at (0): [<0000000000000000>] 0x0
> hardirqs last disabled at (0): [<0000000060044b82>] copy_process+0xa02/0x244e
> softirqs last  enabled at (0): [<0000000060044b82>] copy_process+0xa02/0x244e
> softirqs last disabled at (0): [<0000000000000000>] 0x0
> CPU: 0 PID: 282 Comm: startup.sh Not tainted 6.6.0-rc1 #147
> Stack:
>   7229be60 60500273 00000002 6003cfc9
>   606bd782 00000000 60b3e968 00000000
>   7229bea0 60526312 00000081 00000000
> Call Trace:
>   [<6051cbaa>] ? _printk+0x0/0x94
>   [<6002a5b4>] show_stack+0x13d/0x14c
>   [<60500273>] ? dump_stack_print_info+0xde/0xed
>   [<6003cfc9>] ? um_set_signals+0x0/0x3f
>   [<60526312>] dump_stack_lvl+0x62/0x96
>   [<6051cbaa>] ? _printk+0x0/0x94
>   [<6052729b>] ? debug_lockdep_rcu_enabled+0x0/0x3b
>   [<60526360>] dump_stack+0x1a/0x1c
>   [<60073561>] __might_resched+0x2bb/0x2d9
>   [<60073640>] __might_sleep+0xc1/0xcb
>   [<6052bad8>] down_read+0x32/0x1c3
>   [<6002c94e>] force_flush_all+0x74/0x105
TLB once again by the look of it.
>   [<6002926e>] fork_handler+0x14/0x96
>
>
> I had enabled CONFIG_DEBUG_ATOMIC_SLEEP because that's actually
> something I'd really like to have in our testing.
>
> But with that issue I don't even know how we get there really. It
> doesn't even happen every time we fork?
>
> I'll dig a little bit, but did you try enabling
> CONFIG_DEBUG_ATOMIC_SLEEP also?

Will do. I have no crashes over here so I need to trigger this one first.

Though, frankly, if it is a race in a tlb flush it may be subject to local conditions. So it will be difficult to reproduce.

>
> johannes
>
Johannes Berg Sept. 22, 2023, 7:40 a.m. UTC | #5
On Fri, 2023-09-22 at 08:38 +0100, Anton Ivanov wrote:
> > 
> > I had enabled CONFIG_DEBUG_ATOMIC_SLEEP because that's actually
> > something I'd really like to have in our testing.
> > 
> > But with that issue I don't even know how we get there really. It
> > doesn't even happen every time we fork?
> > 
> > I'll dig a little bit, but did you try enabling
> > CONFIG_DEBUG_ATOMIC_SLEEP also?
> 
> Will do. I have no crashes over here so I need to trigger this one first.
> 
> Though, frankly, if it is a race in a tlb flush it may be subject to local conditions. So it will be difficult to reproduce.
> 

Yes, it might be a race in the fork handler maybe? IOW, whenever we
actually do fork, it depends on what conditions the rest is in? I'm not
sure I fully figured out right now how the fork handler is doing all
this, but I sent my mail before I could since you were discussing :)

But it shouldn't be impossible - while I don't see it for every fork, it
does happen 8 or 9 times in a very simple test that just boots, pings a
bit, and then shuts down, so ...

johannes
Anton Ivanov Sept. 22, 2023, 8:01 a.m. UTC | #6
On 22/09/2023 08:40, Johannes Berg wrote:
> On Fri, 2023-09-22 at 08:38 +0100, Anton Ivanov wrote:
>>> I had enabled CONFIG_DEBUG_ATOMIC_SLEEP because that's actually
>>> something I'd really like to have in our testing.
>>>
>>> But with that issue I don't even know how we get there really. It
>>> doesn't even happen every time we fork?
>>>
>>> I'll dig a little bit, but did you try enabling
>>> CONFIG_DEBUG_ATOMIC_SLEEP also?
>> Will do. I have no crashes over here so I need to trigger this one first.
>>
>> Though, frankly, if it is a race in a tlb flush it may be subject to local conditions. So it will be difficult to reproduce.
>>
> Yes, it might be a race in the fork handler maybe? IOW, whenever we
> actually do fork, it depends on what conditions the rest is in? I'm not
> sure I fully figured out right now how the fork handler is doing all
> this, but I sent my mail before I could since you were discussing :)
>
> But it shouldn't be impossible - while I don't see it for every fork, it
> does happen 8 or 9 times in a very simple test that just boots, pings a
> bit, and then shuts down, so ...

My favorite test which I run on all changes is:

find /usr -type f -exec cat {} > /dev/null \;

On Debian this forks /bin/cat for each file and does IO on it. That test passes here every time :(

However, I did not have debug turned on. So, I would not be surprised if something in the debug

portion of the config makes the crashes more likely.

Otherwise, each fork is a forced tlb_flush_all, so it is a massive tlb exercise. Any bugs, locking, etc are likely to show up.

>
> johannes
>
Johannes Berg Sept. 22, 2023, 8:13 a.m. UTC | #7
On Fri, 2023-09-22 at 09:01 +0100, Anton Ivanov wrote:
> 
> My favorite test which I run on all changes is:
> 
> find /usr -type f -exec cat {} > /dev/null \;
> 
> On Debian this forks /bin/cat for each file and does IO on it. That test passes here every time :(

Oh, apart from the splat, it all _works_ fine.

> However, I did not have debug turned on. So, I would not be surprised if something in the debug

OK.

> portion of the config makes the crashes more likely.

It's just a debug warning for me, no real crash.

> Otherwise, each fork is a forced tlb_flush_all, so it is a massive tlb exercise. Any bugs, locking, etc are likely to show up.
> 

Yes, but when does the fork actually happen?

That part confuses me - the fork_handler() seems to be called at various
kind of random points in time after?

johannes
Anton Ivanov Sept. 22, 2023, 8:40 a.m. UTC | #8
On 22/09/2023 09:13, Johannes Berg wrote:
> On Fri, 2023-09-22 at 09:01 +0100, Anton Ivanov wrote:
>> My favorite test which I run on all changes is:
>>
>> find /usr -type f -exec cat {} > /dev/null \;
>>
>> On Debian this forks /bin/cat for each file and does IO on it. That test passes here every time :(
> Oh, apart from the splat, it all _works_ fine.

The splat about mm is caused by this:

void force_flush_all(void)
{
     struct mm_struct *mm = current->mm;
     struct vm_area_struct *vma;
     VMA_ITERATOR(vmi, mm, 0);

     mmap_read_lock(mm);

     ^^^^^^^^^^^^^^

     for_each_vma(vmi, vma)
         fix_range(mm, vma->vm_start, vma->vm_end, 1);
     mmap_read_unlock(mm);

     ^^^^^^^^^^^^^^

}

We invoke this on every fork. MM is now locked "gently" using a semaphore and that is supposed to

be sleepable. From there on you get a warning if atomic sleep is enabled.


>
>> However, I did not have debug turned on. So, I would not be surprised if something in the debug
> OK.
>
>> portion of the config makes the crashes more likely.
> It's just a debug warning for me, no real crash.
>
>> Otherwise, each fork is a forced tlb_flush_all, so it is a massive tlb exercise. Any bugs, locking, etc are likely to show up.
>>
> Yes, but when does the fork actually happen?
>
> That part confuses me - the fork_handler() seems to be called at various
> kind of random points in time after?
Indeed. I am trying to chase it down. It seems harmless at this point, but it would be nice to understand what causes it.
>
> johannes
>
Johannes Berg Sept. 22, 2023, 8:41 a.m. UTC | #9
> 
> Yes, but when does the fork actually happen?
> 

Looking further at this, now I'm confused as to why it doesn't happen
_all_ the time.

I think this has pretty much always been wrong, just now we actually
notice it?

Basically, when we create a new thread (really just mm I think), we say
the first thing that has to run there is fork_handler(), which
initialises things the first time around. This calls force_flush_all()

But of course it's called from __schedule(), which has
preemption/interrupts disabled. So you can't do mmap_read_lock()?

But I'm confused as to why it doesn't seem happen all the time?

johannes
Johannes Berg Sept. 22, 2023, 8:43 a.m. UTC | #10
On Fri, 2023-09-22 at 10:41 +0200, Johannes Berg wrote:
> > 
> > Yes, but when does the fork actually happen?
> > 
> 
> Looking further at this, now I'm confused as to why it doesn't happen
> _all_ the time.
> 
> I think this has pretty much always been wrong, just now we actually
> notice it?
> 
> Basically, when we create a new thread (really just mm I think), we say
> the first thing that has to run there is fork_handler(), which
> initialises things the first time around. This calls force_flush_all()
> 
> But of course it's called from __schedule(), which has
> preemption/interrupts disabled. So you can't do mmap_read_lock()?
> 
> But I'm confused as to why it doesn't seem happen all the time?
> 

Haha, no, I'm an idiot - should've checked earlier. __might_resched()
has a ratelimit on this print ... :)

johannes
Johannes Berg Sept. 22, 2023, 9:04 a.m. UTC | #11
On Fri, 2023-09-22 at 10:43 +0200, Johannes Berg wrote:
> > 
> > I think this has pretty much always been wrong, just now we actually
> > notice it?
> > 
> > Basically, when we create a new thread (really just mm I think), we say
> > the first thing that has to run there is fork_handler(), which
> > initialises things the first time around. This calls force_flush_all()
> > 

So I thought we could perhaps just force_flush_all() the new mm in
init_new_context(), but that segfaults userspace immediately.

So I need to fully understand first (again?) why we even need
force_flush_all() at this spot, but probably not today.

johannes
Anton Ivanov Sept. 22, 2023, 9:06 a.m. UTC | #12
On 22/09/2023 09:41, Johannes Berg wrote:
>> Yes, but when does the fork actually happen?
>>
> Looking further at this, now I'm confused as to why it doesn't happen
> _all_ the time.
>
> I think this has pretty much always been wrong, just now we actually
> notice it?
>
> Basically, when we create a new thread (really just mm I think), we say
> the first thing that has to run there is fork_handler(), which
> initialises things the first time around. This calls force_flush_all()
>
> But of course it's called from __schedule(), which has
> preemption/interrupts disabled. So you can't do mmap_read_lock()?

Stupid question.

If we have preemption and interrupts disabled and we are UP do we really need to lock it at this point?

>
> But I'm confused as to why it doesn't seem happen all the time?
>
> johannes
>
> _______________________________________________
> linux-um mailing list
> linux-um@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-um
>
Johannes Berg Sept. 22, 2023, 9:10 a.m. UTC | #13
On Fri, 2023-09-22 at 10:06 +0100, Anton Ivanov wrote:
> On 22/09/2023 09:41, Johannes Berg wrote:
> > > Yes, but when does the fork actually happen?
> > > 
> > Looking further at this, now I'm confused as to why it doesn't happen
> > _all_ the time.
> > 
> > I think this has pretty much always been wrong, just now we actually
> > notice it?
> > 
> > Basically, when we create a new thread (really just mm I think), we say
> > the first thing that has to run there is fork_handler(), which
> > initialises things the first time around. This calls force_flush_all()
> > 
> > But of course it's called from __schedule(), which has
> > preemption/interrupts disabled. So you can't do mmap_read_lock()?
> 
> Stupid question.
> 
> If we have preemption and interrupts disabled and we are UP do we really need to lock it at this point?

Heh.

Functionally, I guess not. But lockdep gets really unhappy about stuff
in there:

=============================
WARNING: suspicious RCU usage
6.6.0-rc1 #159 Not tainted
-----------------------------
lib/maple_tree.c:840 suspicious rcu_dereference_check() usage!

other info that might help us debug this:


rcu_scheduler_active = 2, debug_locks = 1
no locks held by startup.sh/282.

stack backtrace:
CPU: 0 PID: 282 Comm: startup.sh Not tainted 6.6.0-rc1 #159
Stack:
 71a9bdd0 605001f3 00000000 6003cf40
 606bd782 00000000 60b3e968 00000000
 71a9be10 60526292 00000082 6051cb2a
Call Trace:
 [<6051cb2a>] ? _printk+0x0/0x94
 [<6002a5b4>] show_stack+0x13d/0x14c
 [<605001f3>] ? dump_stack_print_info+0xde/0xed
 [<6003cf40>] ? um_set_signals+0x0/0x3f
 [<60526292>] dump_stack_lvl+0x62/0x96
 [<6051cb2a>] ? _printk+0x0/0x94
 [<605262e0>] dump_stack+0x1a/0x1c
 [<6008865e>] lockdep_rcu_suspicious+0x10d/0x11c
 [<6052721b>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<6052721b>] ? debug_lockdep_rcu_enabled+0x0/0x3b
 [<605085cd>] mas_root+0x86/0x91
 [<60508547>] ? mas_root+0x0/0x91
 [<60508611>] mas_start+0x39/0x9c
 [<60508875>] ? mas_state_walk+0x0/0x3e
 [<6050888d>] mas_state_walk+0x18/0x3e
 [<605088f1>] mas_walk+0x3e/0x96
 [<60508a03>] mas_find_setup+0xba/0xcd
 [<60509d49>] ? mas_find+0x0/0x4a
 [<60509d71>] mas_find+0x28/0x4a
 [<6002c92b>] force_flush_all+0x51/0x7c
 [<6002926e>] fork_handler+0x14/0x96


johannes
Anton Ivanov Sept. 22, 2023, 9:12 a.m. UTC | #14
On 22/09/2023 10:04, Johannes Berg wrote:
> On Fri, 2023-09-22 at 10:43 +0200, Johannes Berg wrote:
>>>
>>> I think this has pretty much always been wrong, just now we actually
>>> notice it?
>>>
>>> Basically, when we create a new thread (really just mm I think), we say
>>> the first thing that has to run there is fork_handler(), which
>>> initialises things the first time around. This calls force_flush_all()
>>>
> 
> So I thought we could perhaps just force_flush_all() the new mm in
> init_new_context(), but that segfaults userspace immediately.
>

Been there done that.

> So I need to fully understand first (again?) why we even need
> force_flush_all() at this spot, but probably not today.
No idea, but it does not seem to work without it. This is actually
the biggest performance bugbear in UML. If this is fixed, the 
performance will go reasonably close to native.

> 
> johannes
> 
> _______________________________________________
> linux-um mailing list
> linux-um@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-um
>
Johannes Berg Sept. 22, 2023, 9:14 a.m. UTC | #15
On Fri, 2023-09-22 at 11:10 +0200, Johannes Berg wrote:
> On Fri, 2023-09-22 at 10:06 +0100, Anton Ivanov wrote:
> > On 22/09/2023 09:41, Johannes Berg wrote:
> > > > Yes, but when does the fork actually happen?
> > > > 
> > > Looking further at this, now I'm confused as to why it doesn't happen
> > > _all_ the time.
> > > 
> > > I think this has pretty much always been wrong, just now we actually
> > > notice it?
> > > 
> > > Basically, when we create a new thread (really just mm I think), we say
> > > the first thing that has to run there is fork_handler(), which
> > > initialises things the first time around. This calls force_flush_all()
> > > 
> > > But of course it's called from __schedule(), which has
> > > preemption/interrupts disabled. So you can't do mmap_read_lock()?
> > 
> > Stupid question.
> > 
> > If we have preemption and interrupts disabled and we are UP do we really need to lock it at this point?
> 
> Heh.
> 

Ah, but this works:

--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -606,8 +606,8 @@ void force_flush_all(void)
 	struct vm_area_struct *vma;
 	VMA_ITERATOR(vmi, mm, 0);
 
-	mmap_read_lock(mm);
+	rcu_read_lock();
 	for_each_vma(vmi, vma)
 		fix_range(mm, vma->vm_start, vma->vm_end, 1);
-	mmap_read_unlock(mm);
+	rcu_read_unlock();
 }


it seems?

And it doesn't even seem _bad_ since as you say nothing can change, and
we only inject stuff into the process in fix_range() anyway.

So maybe that works - perhaps with a big comment?

johannes
Anton Ivanov Sept. 22, 2023, 9:19 a.m. UTC | #16
On 22/09/2023 10:14, Johannes Berg wrote:
> On Fri, 2023-09-22 at 11:10 +0200, Johannes Berg wrote:
>> On Fri, 2023-09-22 at 10:06 +0100, Anton Ivanov wrote:
>>> On 22/09/2023 09:41, Johannes Berg wrote:
>>>>> Yes, but when does the fork actually happen?
>>>>>
>>>> Looking further at this, now I'm confused as to why it doesn't happen
>>>> _all_ the time.
>>>>
>>>> I think this has pretty much always been wrong, just now we actually
>>>> notice it?
>>>>
>>>> Basically, when we create a new thread (really just mm I think), we say
>>>> the first thing that has to run there is fork_handler(), which
>>>> initialises things the first time around. This calls force_flush_all()
>>>>
>>>> But of course it's called from __schedule(), which has
>>>> preemption/interrupts disabled. So you can't do mmap_read_lock()?
>>>
>>> Stupid question.
>>>
>>> If we have preemption and interrupts disabled and we are UP do we really need to lock it at this point?
>>
>> Heh.
>>
> 
> Ah, but this works:
> 
> --- a/arch/um/kernel/tlb.c
> +++ b/arch/um/kernel/tlb.c
> @@ -606,8 +606,8 @@ void force_flush_all(void)
>   	struct vm_area_struct *vma;
>   	VMA_ITERATOR(vmi, mm, 0);
>   
> -	mmap_read_lock(mm);
> +	rcu_read_lock();
>   	for_each_vma(vmi, vma)
>   		fix_range(mm, vma->vm_start, vma->vm_end, 1);
> -	mmap_read_unlock(mm);
> +	rcu_read_unlock();
>   }
> 
> 
> it seems?
> 
> And it doesn't even seem _bad_ since as you say nothing can change, and
> we only inject stuff into the process in fix_range() anyway.
> 
> So maybe that works - perhaps with a big comment?

Ack. Will add this to the patch and run it through its paces.

> 
> johannes
>
Johannes Berg Sept. 22, 2023, 9:51 a.m. UTC | #17
On Fri, 2023-09-22 at 10:19 +0100, Anton Ivanov wrote:
> > 
> > So maybe that works - perhaps with a big comment?
> 
> Ack. Will add this to the patch and run it through its paces.
> 

We can also just get rid of it entirely:

diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 68e2eb9cfb47..23dcc914d44e 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -13,8 +13,6 @@
 #include <asm/mm_hooks.h>
 #include <asm/mmu.h>
 
-extern void force_flush_all(void);
-
 #define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *old, struct mm_struct *new)
 {
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 6daffb9d8a8d..a024acd6d85c 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -25,7 +25,6 @@
 #include <linux/threads.h>
 #include <linux/resume_user_mode.h>
 #include <asm/current.h>
-#include <asm/mmu_context.h>
 #include <linux/uaccess.h>
 #include <as-layout.h>
 #include <kern_util.h>
@@ -139,8 +138,6 @@ void new_thread_handler(void)
 /* Called magically, see new_thread_handler above */
 void fork_handler(void)
 {
-	force_flush_all();
-
 	schedule_tail(current->thread.prev_sched);
 
 	/*
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 656fe16c9b63..f3766dbbc4ee 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -30,10 +30,7 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 		from_mm = &current->mm->context;
 
 	block_signals_trace();
-	if (from_mm)
-		to_mm->id.u.pid = copy_context_skas0(stack,
-						     from_mm->id.u.pid);
-	else to_mm->id.u.pid = start_userspace(stack);
+	to_mm->id.u.pid = start_userspace(stack);
 	unblock_signals_trace();
 
 	if (to_mm->id.u.pid < 0) {
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 34ec8e677fb9..ce7fb5a34f0f 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -599,15 +599,3 @@ void flush_tlb_mm(struct mm_struct *mm)
 	for_each_vma(vmi, vma)
 		fix_range(mm, vma->vm_start, vma->vm_end, 0);
 }
-
-void force_flush_all(void)
-{
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	VMA_ITERATOR(vmi, mm, 0);
-
-	mmap_read_lock(mm);
-	for_each_vma(vmi, vma)
-		fix_range(mm, vma->vm_start, vma->vm_end, 1);
-	mmap_read_unlock(mm);
-}




I think that _might_ be slower for fork() happy workloads, since we
don't init from skas0, but I'm not even sure what's _in_ skas0 at that
point? Does it matter?

johannes
Johannes Berg Sept. 22, 2023, 9:55 a.m. UTC | #18
On Fri, 2023-09-22 at 11:51 +0200, Johannes Berg wrote:
> On Fri, 2023-09-22 at 10:19 +0100, Anton Ivanov wrote:
> > > 
> > > So maybe that works - perhaps with a big comment?
> > 
> > Ack. Will add this to the patch and run it through its paces.
> > 
> 
> We can also just get rid of it entirely:
> 

And then we can remove all the skas0 stuff, including init_thread_regs
and all.

Seems the cost would be basically that we now pagefault everything
that's used between fork() and exec()? But is that so much and so
important?

johannes
Anton Ivanov Sept. 22, 2023, 10:55 a.m. UTC | #19
On 22/09/2023 10:55, Johannes Berg wrote:
> On Fri, 2023-09-22 at 11:51 +0200, Johannes Berg wrote:
>> On Fri, 2023-09-22 at 10:19 +0100, Anton Ivanov wrote:
>>>> So maybe that works - perhaps with a big comment?
>>> Ack. Will add this to the patch and run it through its paces.
>>>
>> We can also just get rid of it entirely:
>>
> And then we can remove all the skas0 stuff, including init_thread_regs
> and all.
>
> Seems the cost would be basically that we now pagefault everything
> that's used between fork() and exec()? But is that so much and so
> important?

Needs testing :)

Let's get the PREEMPT out of the door first.

>
> johannes
>
Johannes Berg Sept. 22, 2023, 11:17 a.m. UTC | #20
On Fri, 2023-09-22 at 11:55 +0100, Anton Ivanov wrote:
> On 22/09/2023 10:55, Johannes Berg wrote:
> > On Fri, 2023-09-22 at 11:51 +0200, Johannes Berg wrote:
> > > On Fri, 2023-09-22 at 10:19 +0100, Anton Ivanov wrote:
> > > > > So maybe that works - perhaps with a big comment?
> > > > Ack. Will add this to the patch and run it through its paces.
> > > > 
> > > We can also just get rid of it entirely:
> > > 
> > And then we can remove all the skas0 stuff, including init_thread_regs
> > and all.
> > 
> > Seems the cost would be basically that we now pagefault everything
> > that's used between fork() and exec()? But is that so much and so
> > important?
> 
> Needs testing :)

Yes. I sent a patch just now, works for me.

> Let's get the PREEMPT out of the door first.

Dunno. It has a conflict, but I think it's kind of a bugfix, the whole
mm creation copying from 'current' seems fishy when you consider
clone(CLONE_VM).

johannes
diff mbox series

Patch

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index b5e179360534..603f5fd82293 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -11,7 +11,6 @@  config UML
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_STRNCPY_FROM_USER
 	select ARCH_HAS_STRNLEN_USER
-	select ARCH_NO_PREEMPT
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_KASAN if X86_64
 	select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
diff --git a/arch/um/include/asm/fpu/api.h b/arch/um/include/asm/fpu/api.h
index 71bfd9ef3938..9e7680bf48f0 100644
--- a/arch/um/include/asm/fpu/api.h
+++ b/arch/um/include/asm/fpu/api.h
@@ -4,12 +4,15 @@ 
 
 /* Copyright (c) 2020 Cambridge Greys Ltd
  * Copyright (c) 2020 Red Hat Inc.
- * A set of "dummy" defines to allow the direct inclusion
- * of x86 optimized copy, xor, etc routines into the
- * UML code tree. */
+ */
 
+#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+extern void kernel_fpu_begin(void);
+extern void kernel_fpu_end(void);
+#else
 #define kernel_fpu_begin() (void)0
 #define kernel_fpu_end() (void)0
+#endif
 
 static inline bool irq_fpu_usable(void)
 {
diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h
index 7414154b8e9a..c08e8c5b0249 100644
--- a/arch/um/include/asm/processor-generic.h
+++ b/arch/um/include/asm/processor-generic.h
@@ -44,6 +44,9 @@  struct thread_struct {
 			} cb;
 		} u;
 	} request;
+#if defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+    u8 fpu[2048] __aligned(64); /* Intel docs require xsave/xrestore area to be aligned to 64 bytes */
+#endif
 };
 
 #define INIT_THREAD \
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 811188be954c..c616e884a488 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -26,9 +26,13 @@  obj-$(CONFIG_OF) += dtb.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o
+obj-$(CONFIG_PREEMPT) += fpu.o
+obj-$(CONFIG_PREEMPT_VOLUNTARY) += fpu.o
 
 USER_OBJS := config.o
 
+CFLAGS_fpu.o += -mxsave -mxsaveopt
+
 include $(srctree)/arch/um/scripts/Makefile.rules
 
 targets := config.c config.tmp capflags.c
diff --git a/arch/um/kernel/fpu.c b/arch/um/kernel/fpu.c
new file mode 100644
index 000000000000..346c07236185
--- /dev/null
+++ b/arch/um/kernel/fpu.c
@@ -0,0 +1,77 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Cambridge Greys Ltd
+ * Copyright (C) 2023 Red Hat Inc
+ */
+
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <asm/fpu/api.h>
+#include <asm/cpufeature.h>
+
+/*
+ * The critical section between kernel_fpu_begin() and kernel_fpu_end()
+ * is non-reentrant. It is the caller's responsibility to avoid reentrance.
+ */
+
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+/* UML and driver code it pulls out of the x86 tree knows about 387 features
+ * up to and including AVX512. TILE, etc are not yet supported.
+ */
+
+#define KNOWN_387_FEATURES 0xFF
+
+
+void kernel_fpu_begin(void)
+{
+	preempt_disable();
+
+	WARN_ON(this_cpu_read(in_kernel_fpu));
+
+	this_cpu_write(in_kernel_fpu, true);
+
+#ifdef CONFIG_64BIT
+	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
+		__builtin_ia32_xsaveopt64(&current->thread.fpu, KNOWN_387_FEATURES);
+	else {
+		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
+			__builtin_ia32_xsave64(&current->thread.fpu, KNOWN_387_FEATURES);
+		else
+			__builtin_ia32_fxsave64(&current->thread.fpu);
+	}
+#else
+	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVEOPT)))
+		__builtin_ia32_xsaveopt(&current->thread.fpu, KNOWN_387_FEATURES);
+	else {
+		if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
+			__builtin_ia32_xsave(&current->thread.fpu, KNOWN_387_FEATURES);
+		else
+			__builtin_ia32_fxsave(&current->thread.fpu);
+	}
+#endif
+}
+
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+	WARN_ON(!this_cpu_read(in_kernel_fpu));
+
+#ifdef CONFIG_64BIT
+	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
+		__builtin_ia32_xrstor64(&current->thread.fpu, KNOWN_387_FEATURES);
+	else
+		__builtin_ia32_fxrstor64(&current->thread.fpu);
+#else
+	if (likely(cpu_has(&boot_cpu_data, X86_FEATURE_XSAVE)))
+		__builtin_ia32_xrstor(&current->thread.fpu, KNOWN_387_FEATURES);
+	else
+		__builtin_ia32_fxrstor(&current->thread.fpu);
+#endif
+	this_cpu_write(in_kernel_fpu, false);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 635d44606bfe..c02525da45df 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -195,7 +195,9 @@  static void _sigio_handler(struct uml_pt_regs *regs,
 
 void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 {
+	preempt_disable();
 	_sigio_handler(regs, irqs_suspended);
+	preempt_enable();
 }
 
 static struct irq_entry *get_irq_entry_by_fd(int fd)
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 7d050ab0f78a..34ec8e677fb9 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -362,6 +362,9 @@  static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
 
 	mm = &init_mm;
 	hvc = INIT_HVC(mm, force, userspace);
+
+    preempt_disable();
+
 	for (addr = start; addr < end;) {
 		pgd = pgd_offset(mm, addr);
 		if (!pgd_present(*pgd)) {
@@ -449,6 +452,9 @@  static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
 
 	if (err < 0)
 		panic("flush_tlb_kernel failed, errno = %d\n", err);
+
+    preempt_enable();
+
 	return updated;
 }
 
@@ -466,6 +472,8 @@  void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
 
 	address &= PAGE_MASK;
 
+    preempt_disable();
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		goto kill;
@@ -520,6 +528,7 @@  void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
 
 	*pte = pte_mkuptodate(*pte);
 
+    preempt_enable();
 	return;
 
 kill: