[2/4] arch/x86: implement the process_vm_exec syscall

Message ID 20210414055217.543246-3-avagin@gmail.com
State Not Applicable
Series Allow executing code and syscalls in another address space

Commit Message

Andrei Vagin April 14, 2021, 5:52 a.m. UTC
This change introduces the new system call:
process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
		siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)

process_vm_exec allows executing the current process in the address
space of another process.

process_vm_exec swaps the current address space with the address space
of the specified process, sets the state from sigcontext, and resumes
the process. When the process receives a signal or calls a system call,
process_vm_exec saves the process state back to sigcontext, restores
the original address space, restores the original process state, and
returns to userspace.
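
For illustration, the intended call pattern from the supervisor side
looks roughly like this (a sketch, not part of the patch; run_guest(),
guest_rip and guest_rsp are made-up names, the syscall is invoked by
the raw number this series assigns because no libc wrapper exists, and
the seeding of cs/ss/eflags from a previously captured sigcontext is
elided):

	#define _GNU_SOURCE
	#include <signal.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_process_vm_exec
	#define __NR_process_vm_exec 441
	#endif

	/* Resume guest code mapped in the target process "pid" until
	 * it makes a system call or a masked signal arrives. */
	static long run_guest(pid_t pid, unsigned long guest_rip,
			      unsigned long guest_rsp)
	{
		struct sigcontext ctx;
		siginfo_t si;
		sigset_t mask;

		memset(&ctx, 0, sizeof(ctx));
		ctx.rip = guest_rip;
		ctx.rsp = guest_rsp;
		/* ctx.fpstate stays NULL: the FPU state is managed
		 * from user-space. */

		sigemptyset(&mask);

		/* On return, ctx holds the guest cpu state and si
		 * describes the interrupting event; 8 is the size of
		 * the kernel sigset_t, as for rt_sigprocmask(). */
		return syscall(__NR_process_vm_exec, pid, &ctx, 0UL,
			       &si, &mask, 8);
	}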

If process_vm_exec is interrupted by a signal and the signal is in
user_mask, the signal is dequeued and information about it is saved in
uinfo. If process_vm_exec is interrupted by a system call, a synthetic
siginfo for the SIGSYS signal is generated.
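
The synthetic siginfo carries the same fields seccomp uses for SIGSYS;
the do_syscall_64() hook in this patch fills it in as:

	kernel_siginfo_t info = {
		.si_signo = SIGSYS,
		.si_call_addr = (void __user *)KSTK_EIP(current),
		.si_arch = syscall_get_arch(current),
		.si_syscall = nr,
	};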

The behavior of this system call is similar to PTRACE_SYSEMU, but
everything happens in the context of one process, so process_vm_exec
shows better performance.

PTRACE_SYSEMU is primarily used to implement sandboxes (application
kernels) like User-mode Linux or gVisor. These types of sandboxes
intercept application system calls and act as the guest kernel.
A simple benchmark, where a "tracee" process executes system calls in a
loop and a "tracer" process traps the syscalls and handles them by just
incrementing the tracee's instruction pointer to skip the syscall
instruction, shows that process_vm_exec works more than 5 times faster
than PTRACE_SYSEMU.
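
For reference, the PTRACE_SYSEMU half of such a tracer can be sketched
as follows (illustrative only, x86-64, error handling elided; this is
not the actual benchmark code):

	#include <sys/ptrace.h>
	#include <sys/user.h>
	#include <sys/wait.h>

	static void sysemu_loop(pid_t tracee)
	{
		struct user_regs_struct regs;
		int status;

		for (;;) {
			ptrace(PTRACE_SYSEMU, tracee, 0, 0);
			waitpid(tracee, &status, 0);
			if (WIFEXITED(status))
				break;
			/* SYSEMU already suppressed the syscall itself;
			 * just fake a successful return value. */
			ptrace(PTRACE_GETREGS, tracee, 0, &regs);
			regs.rax = 0;
			ptrace(PTRACE_SETREGS, tracee, 0, &regs);
		}
	}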

Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
 arch/Kconfig                           |  15 +++
 arch/x86/Kconfig                       |   1 +
 arch/x86/entry/common.c                |  16 +++
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 arch/x86/include/asm/sigcontext.h      |   2 +
 arch/x86/kernel/Makefile               |   1 +
 arch/x86/kernel/process_vm_exec.c      | 133 +++++++++++++++++++++++++
 arch/x86/kernel/signal.c               |  47 +++++++++
 include/linux/process_vm_exec.h        |  15 +++
 include/linux/sched.h                  |   7 ++
 include/linux/syscalls.h               |   6 ++
 include/uapi/asm-generic/unistd.h      |   4 +-
 kernel/fork.c                          |   9 ++
 kernel/sys_ni.c                        |   2 +
 14 files changed, 258 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/process_vm_exec.c
 create mode 100644 include/linux/process_vm_exec.h

Comments

Oleg Nesterov April 14, 2021, 5:09 p.m. UTC | #1
On 04/13, Andrei Vagin wrote:
>
> +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> +{
> +	struct task_struct *tsk = current;
> +	struct mm_struct *active_mm;
> +
> +	task_lock(tsk);
> +	/* Hold off tlb flush IPIs while switching mm's */
> +	local_irq_disable();
> +
> +	sync_mm_rss(prev_mm);
> +
> +	vmacache_flush(tsk);
> +
> +	active_mm = tsk->active_mm;
> +	if (active_mm != target_mm) {
> +		mmgrab(target_mm);
> +		tsk->active_mm = target_mm;
> +	}
> +	tsk->mm = target_mm;
> +	switch_mm_irqs_off(active_mm, target_mm, tsk);
> +	local_irq_enable();
> +	task_unlock(tsk);
> +#ifdef finish_arch_post_lock_switch
> +	finish_arch_post_lock_switch();
> +#endif
> +
> +	if (active_mm != target_mm)
> +		mmdrop(active_mm);
> +}

I think this should be unified with kthread_use_mm() somehow...
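
For comparison, kthread_use_mm() (kernel/kthread.c, abridged here from
around v5.10) is almost the same code, modulo the PF_KTHREAD
assertions:

void kthread_use_mm(struct mm_struct *mm)
{
	struct mm_struct *active_mm;
	struct task_struct *tsk = current;

	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
	WARN_ON_ONCE(tsk->mm);

	task_lock(tsk);
	/* Hold off tlb flush IPIs while switching mm's */
	local_irq_disable();
	active_mm = tsk->active_mm;
	if (active_mm != mm) {
		mmgrab(mm);
		tsk->active_mm = mm;
	}
	tsk->mm = mm;
	switch_mm_irqs_off(active_mm, mm, tsk);
	local_irq_enable();
	task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
	finish_arch_post_lock_switch();
#endif

	if (active_mm != mm)
		mmdrop(active_mm);
}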

And does it really need the "prev_mm" argument? It must be tsk->mm, no?

Oleg.
Andrei Vagin April 23, 2021, 6:59 a.m. UTC | #2
On Wed, Apr 14, 2021 at 07:09:15PM +0200, Oleg Nesterov wrote:
> On 04/13, Andrei Vagin wrote:
> >
> > +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> > +{
> > +	struct task_struct *tsk = current;
> > +	struct mm_struct *active_mm;
> > +
> > +	task_lock(tsk);
> > +	/* Hold off tlb flush IPIs while switching mm's */
> > +	local_irq_disable();
> > +
> > +	sync_mm_rss(prev_mm);
> > +
> > +	vmacache_flush(tsk);
> > +
> > +	active_mm = tsk->active_mm;
> > +	if (active_mm != target_mm) {
> > +		mmgrab(target_mm);
> > +		tsk->active_mm = target_mm;
> > +	}
> > +	tsk->mm = target_mm;
> > +	switch_mm_irqs_off(active_mm, target_mm, tsk);
> > +	local_irq_enable();
> > +	task_unlock(tsk);
> > +#ifdef finish_arch_post_lock_switch
> > +	finish_arch_post_lock_switch();
> > +#endif
> > +
> > +	if (active_mm != target_mm)
> > +		mmdrop(active_mm);
> > +}
> 
> I think this should be unified with kthread_use_mm() somehow...

I agree.

> 
> And does it really need the "prev_mm" argument? It must be tsk->mm, no?

No, it doesn't. It is a leftover of unuse_mm(). BTW, why do we pass mm to kthread_unuse_mm()?

Thanks,
Andrei.
Jann Horn June 28, 2021, 4:13 p.m. UTC | #3
On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> This change introduces the new system call:
> process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
>                 siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
>
> process_vm_exec allows executing the current process in the address
> space of another process.
[...]

I still think that this whole API is fundamentally the wrong approach
because it tries to shoehorn multiple usecases with different
requirements into a single API. But that aside:

> +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> +{
> +       struct task_struct *tsk = current;
> +       struct mm_struct *active_mm;
> +
> +       task_lock(tsk);
> +       /* Hold off tlb flush IPIs while switching mm's */
> +       local_irq_disable();
> +
> +       sync_mm_rss(prev_mm);
> +
> +       vmacache_flush(tsk);
> +
> +       active_mm = tsk->active_mm;
> +       if (active_mm != target_mm) {
> +               mmgrab(target_mm);
> +               tsk->active_mm = target_mm;
> +       }
> +       tsk->mm = target_mm;

I'm pretty sure you're not currently allowed to overwrite the ->mm
pointer of a userspace thread. For example, zap_threads() assumes that
all threads running under a process have the same ->mm. (And if you're
fiddling with ->mm stuff, you should probably CC linux-mm@.)
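
For reference, the loop in zap_threads() (fs/coredump.c, abridged from
around v5.10) only looks at the first thread of each process that has
an ->mm at all, which hard-codes that assumption:

	for_each_process(g) {
		if (g == tsk->group_leader)
			continue;
		if (g->flags & PF_KTHREAD)
			continue;

		for_each_thread(g, p) {
			if (unlikely(!p->mm))
				continue;
			if (unlikely(p->mm == mm)) {
				lock_task_sighand(p, &flags);
				nr += zap_process(p, exit_code,
						  SIGNAL_GROUP_COREDUMP);
				unlock_task_sighand(p, &flags);
			}
			break;
		}
	}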

As far as I understand, only kthreads are allowed to do this (as
implemented in kthread_use_mm()).

> +       switch_mm_irqs_off(active_mm, target_mm, tsk);
> +       local_irq_enable();
> +       task_unlock(tsk);
> +#ifdef finish_arch_post_lock_switch
> +       finish_arch_post_lock_switch();
> +#endif
> +
> +       if (active_mm != target_mm)
> +               mmdrop(active_mm);
> +}
Andy Lutomirski June 28, 2021, 4:30 p.m. UTC | #4
On Mon, Jun 28, 2021, at 9:13 AM, Jann Horn wrote:
> On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> > This change introduces the new system call:
> > process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
> >                 siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
> >
> > process_vm_exec allows executing the current process in the address
> > space of another process.
> [...]
> 
> I still think that this whole API is fundamentally the wrong approach
> because it tries to shoehorn multiple usecases with different
> requirements into a single API. But that aside:
> 
> > +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> > +{
> > +       struct task_struct *tsk = current;
> > +       struct mm_struct *active_mm;
> > +
> > +       task_lock(tsk);
> > +       /* Hold off tlb flush IPIs while switching mm's */
> > +       local_irq_disable();
> > +
> > +       sync_mm_rss(prev_mm);
> > +
> > +       vmacache_flush(tsk);
> > +
> > +       active_mm = tsk->active_mm;
> > +       if (active_mm != target_mm) {
> > +               mmgrab(target_mm);
> > +               tsk->active_mm = target_mm;
> > +       }
> > +       tsk->mm = target_mm;
> 
> I'm pretty sure you're not currently allowed to overwrite the ->mm
> pointer of a userspace thread. For example, zap_threads() assumes that
> all threads running under a process have the same ->mm. (And if you're
> fiddling with ->mm stuff, you should probably CC linux-mm@.)

exec_mmap() does it, so it can’t be entirely impossible.
Jann Horn June 28, 2021, 5:14 p.m. UTC | #5
On Mon, Jun 28, 2021 at 6:30 PM Andy Lutomirski <luto@kernel.org> wrote:
> On Mon, Jun 28, 2021, at 9:13 AM, Jann Horn wrote:
> > On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> > > This change introduces the new system call:
> > > process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
> > >                 siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
> > >
> > > process_vm_exec allows executing the current process in the address
> > > space of another process.
> > [...]
> >
> > I still think that this whole API is fundamentally the wrong approach
> > because it tries to shoehorn multiple usecases with different
> > requirements into a single API. But that aside:
> >
> > > +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> > > +{
> > > +       struct task_struct *tsk = current;
> > > +       struct mm_struct *active_mm;
> > > +
> > > +       task_lock(tsk);
> > > +       /* Hold off tlb flush IPIs while switching mm's */
> > > +       local_irq_disable();
> > > +
> > > +       sync_mm_rss(prev_mm);
> > > +
> > > +       vmacache_flush(tsk);
> > > +
> > > +       active_mm = tsk->active_mm;
> > > +       if (active_mm != target_mm) {
> > > +               mmgrab(target_mm);
> > > +               tsk->active_mm = target_mm;
> > > +       }
> > > +       tsk->mm = target_mm;
> >
> > I'm pretty sure you're not currently allowed to overwrite the ->mm
> > pointer of a userspace thread. For example, zap_threads() assumes that
> > all threads running under a process have the same ->mm. (And if you're
> > fiddling with ->mm stuff, you should probably CC linux-mm@.)
>
> exec_mmap() does it, so it can’t be entirely impossible.

Yeah, true, execve can do it - I guess the thing that makes that
special is that it's running after de_thread(), so it's guaranteed to
be single-threaded?
Eric W. Biederman June 28, 2021, 6:18 p.m. UTC | #6
Jann Horn <jannh@google.com> writes:

> On Mon, Jun 28, 2021 at 6:30 PM Andy Lutomirski <luto@kernel.org> wrote:
>> On Mon, Jun 28, 2021, at 9:13 AM, Jann Horn wrote:
>> > On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
>> > > This change introduces the new system call:
>> > > process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
>> > >                 siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
>> > >
>> > > process_vm_exec allows executing the current process in the address
>> > > space of another process.
>> > [...]
>> >
>> > I still think that this whole API is fundamentally the wrong approach
>> > because it tries to shoehorn multiple usecases with different
>> > requirements into a single API. But that aside:
>> >
>> > > +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
>> > > +{
>> > > +       struct task_struct *tsk = current;
>> > > +       struct mm_struct *active_mm;
>> > > +
>> > > +       task_lock(tsk);
>> > > +       /* Hold off tlb flush IPIs while switching mm's */
>> > > +       local_irq_disable();
>> > > +
>> > > +       sync_mm_rss(prev_mm);
>> > > +
>> > > +       vmacache_flush(tsk);
>> > > +
>> > > +       active_mm = tsk->active_mm;
>> > > +       if (active_mm != target_mm) {
>> > > +               mmgrab(target_mm);
>> > > +               tsk->active_mm = target_mm;
>> > > +       }
>> > > +       tsk->mm = target_mm;
>> >
>> > I'm pretty sure you're not currently allowed to overwrite the ->mm
>> > pointer of a userspace thread. For example, zap_threads() assumes that
>> > all threads running under a process have the same ->mm. (And if you're
>> > fiddling with ->mm stuff, you should probably CC linux-mm@.)
>>
>> exec_mmap() does it, so it can’t be entirely impossible.
>
> Yeah, true, execve can do it - I guess the thing that makes that
> special is that it's running after de_thread(), so it's guaranteed to
> be single-threaded?

Even the implementation detail of swapping the mm aside, the idea of
swapping the mm is completely broken, as endless system calls depend
upon the state held in task_struct.  io_uring just tried running
system calls of a process in a different context, and we ultimately
had to make the threads part of the original process to make enough
things work to keep the problem tractable.

System calls deeply and fundamentally depend on task_struct and
signal_struct.

I can think of two possibilities.
1) Hijack an existing process thread.
2) Inject a new thread into an existing process.

Anything else is just an exercise in trouble.  Of these, I think
hijacking an existing thread is the only one that won't require lots
of tracking down of special cases.  I seem to remember audit is still
struggling with how to properly audit io_uring threads.

Eric
Andrei Vagin June 29, 2021, 1:01 a.m. UTC | #7
On Mon, Jun 28, 2021 at 01:18:07PM -0500, Eric W. Biederman wrote:
> Jann Horn <jannh@google.com> writes:
> 
> > On Mon, Jun 28, 2021 at 6:30 PM Andy Lutomirski <luto@kernel.org> wrote:
> >> On Mon, Jun 28, 2021, at 9:13 AM, Jann Horn wrote:
> >> > On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> >> > > This change introduces the new system call:
> >> > > process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
> >> > >                 siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
> >> > >
> >> > > process_vm_exec allows executing the current process in the address
> >> > > space of another process.
> >> > [...]
> >> >
> >> > I still think that this whole API is fundamentally the wrong approach
> >> > because it tries to shoehorn multiple usecases with different
> >> > requirements into a single API. But that aside:
> >> >
> >> > > +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> >> > > +{
> >> > > +       struct task_struct *tsk = current;
> >> > > +       struct mm_struct *active_mm;
> >> > > +
> >> > > +       task_lock(tsk);
> >> > > +       /* Hold off tlb flush IPIs while switching mm's */
> >> > > +       local_irq_disable();
> >> > > +
> >> > > +       sync_mm_rss(prev_mm);
> >> > > +
> >> > > +       vmacache_flush(tsk);
> >> > > +
> >> > > +       active_mm = tsk->active_mm;
> >> > > +       if (active_mm != target_mm) {
> >> > > +               mmgrab(target_mm);
> >> > > +               tsk->active_mm = target_mm;
> >> > > +       }
> >> > > +       tsk->mm = target_mm;
> >> >
> >> > I'm pretty sure you're not currently allowed to overwrite the ->mm
> >> > pointer of a userspace thread. For example, zap_threads() assumes that
> >> > all threads running under a process have the same ->mm. (And if you're
> >> > fiddling with ->mm stuff, you should probably CC linux-mm@.)
> >>
> >> exec_mmap() does it, so it can’t be entirely impossible.
> >
> > Yeah, true, execve can do it - I guess the thing that makes that
> > special is that it's running after de_thread(), so it's guaranteed to
> > be single-threaded?
> 
> Even the implementation detail of swapping the mm aside, the idea of
> swapping the mm is completely broken, as endless system calls depend
> upon the state held in task_struct.  io_uring just tried running
> system calls of a process in a different context, and we ultimately
> had to make the threads part of the original process to make enough
> things work to keep the problem tractable.
> 
> System calls deeply and fundamentally depend on task_struct and
> signal_struct.

In contrast to io_uring, process_vm_exec doesn't intend to run system
calls in the context of the target process. We declare up front that
system calls are executed in the context of the current process, just
with another mm. If we are talking about user-mode kernels, they will
need just two system calls: mmap and munmap. In the case of CRIU,
vmsplice will be used too.

> 
> I can think of two possibilities.
> 1) Hijack an existing process thread.
> 2) Inject a new thread into an existing process.

I am not sure that I understand what you mean here, but it sounds like
we would need to do a context switch to execute anything in the context
of a hijacked thread. If I am right, that kills the main idea of
process_vm_exec. If I have misunderstood your idea, maybe you can
describe it in more detail.

Thanks,
Andrei
Andrei Vagin July 2, 2021, 6:22 a.m. UTC | #8
On Mon, Jun 28, 2021 at 06:13:29PM +0200, Jann Horn wrote:
> On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> > This change introduces the new system call:
> > process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
> >                 siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
> >
> > process_vm_exec allows executing the current process in the address
> > space of another process.
> [...]
> 
> I still think that this whole API is fundamentally the wrong approach
> because it tries to shoehorn multiple usecases with different
> requirements into a single API. But that aside:

Here, I can't agree with you, but this is discussed in the parallel
thread.

> 
> > +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> > +{
> > +       struct task_struct *tsk = current;
> > +       struct mm_struct *active_mm;
> > +
> > +       task_lock(tsk);
> > +       /* Hold off tlb flush IPIs while switching mm's */
> > +       local_irq_disable();
> > +
> > +       sync_mm_rss(prev_mm);
> > +
> > +       vmacache_flush(tsk);
> > +
> > +       active_mm = tsk->active_mm;
> > +       if (active_mm != target_mm) {
> > +               mmgrab(target_mm);
> > +               tsk->active_mm = target_mm;
> > +       }
> > +       tsk->mm = target_mm;
> 
> I'm pretty sure you're not currently allowed to overwrite the ->mm
> pointer of a userspace thread. For example, zap_threads() assumes that
> all threads running under a process have the same ->mm. (And if you're
> fiddling with ->mm stuff, you should probably CC linux-mm@.)
> 
> As far as I understand, only kthreads are allowed to do this (as
> implemented in kthread_use_mm()).

kthread_use_mm() was renamed from use_mm() in the v5.8 kernel. Before
that, it wasn't used for user processes in the kernel, but it was
exported for modules, and we used it without any visible problems. We
understood that there could be some issues like zap_threads, and that
was one of the reasons why we decided to introduce this system call.

I understand that there are no places in the kernel where we change the
mm of user threads back and forth, but are there any real concerns why
we should not do that? I agree that zap_threads should be fixed, but
that will be the easy one.

> 
> > +       switch_mm_irqs_off(active_mm, target_mm, tsk);
> > +       local_irq_enable();
> > +       task_unlock(tsk);
> > +#ifdef finish_arch_post_lock_switch
> > +       finish_arch_post_lock_switch();
> > +#endif
> > +
> > +       if (active_mm != target_mm)
> > +               mmdrop(active_mm);
> > +}
Peter Zijlstra July 2, 2021, 8:51 a.m. UTC | #9
I'm terrified of all of this...

On Tue, Apr 13, 2021 at 10:52:15PM -0700, Andrei Vagin wrote:

> +long swap_vm_exec_context(struct sigcontext __user *uctx)
> +{
> +	struct sigcontext ctx = {};
> +	sigset_t set = {};
> +
> +
> +	if (copy_from_user(&ctx, uctx, CONTEXT_COPY_SIZE))
> +		return -EFAULT;
> +	/* A floating point state is managed from user-space. */
> +	if (ctx.fpstate != 0)
> +		return -EINVAL;
> +	if (!user_access_begin(uctx, sizeof(*uctx)))
> +		return -EFAULT;
> +	unsafe_put_sigcontext(uctx, NULL, current_pt_regs(), (&set), Efault);
> +	user_access_end();

But here you save the sigcontext without FPU state.

> +
> +	if (__restore_sigcontext(current_pt_regs(), &ctx, 0))
> +		goto badframe;

And here you restore sigcontext, *with* FPU state.  At which point your
FPU state is irrecoverably lost.

Also, I'm not at all convinced this can ever do the right thing when the
tasks don't agree on what the FPU state is. I suppose in the best case
the save will EFAULT.

> +
> +	return 0;
> +Efault:
> +	user_access_end();
> +badframe:
> +	signal_fault(current_pt_regs(), uctx, "swap_vm_exec_context");
> +	return -EFAULT;
> +}
Jann Horn July 2, 2021, 11:51 a.m. UTC | #10
On Fri, Jul 2, 2021 at 8:25 AM Andrei Vagin <avagin@gmail.com> wrote:
> On Mon, Jun 28, 2021 at 06:13:29PM +0200, Jann Horn wrote:
> > On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> > > +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> > > +{
> > > +       struct task_struct *tsk = current;
> > > +       struct mm_struct *active_mm;
> > > +
> > > +       task_lock(tsk);
> > > +       /* Hold off tlb flush IPIs while switching mm's */
> > > +       local_irq_disable();
> > > +
> > > +       sync_mm_rss(prev_mm);
> > > +
> > > +       vmacache_flush(tsk);
> > > +
> > > +       active_mm = tsk->active_mm;
> > > +       if (active_mm != target_mm) {
> > > +               mmgrab(target_mm);
> > > +               tsk->active_mm = target_mm;
> > > +       }
> > > +       tsk->mm = target_mm;
> >
> > I'm pretty sure you're not currently allowed to overwrite the ->mm
> > pointer of a userspace thread. For example, zap_threads() assumes that
> > all threads running under a process have the same ->mm. (And if you're
> > fiddling with ->mm stuff, you should probably CC linux-mm@.)
> >
> > As far as I understand, only kthreads are allowed to do this (as
> > implemented in kthread_use_mm()).
>
> kthread_use_mm() was renamed from use_mm() in the v5.8 kernel. Before
> that, it wasn't used for user processes in the kernel, but it was
> exported for modules, and we used it without any visible problems. We
> understood that there could be some issues like zap_threads, and that
> was one of the reasons why we decided to introduce this system call.
>
> I understand that there are no places in the kernel where we change the
> mm of user threads back and forth, but are there any real concerns why
> we should not do that? I agree that zap_threads should be fixed, but
> that will be the easy one.

My point is that if you break a preexisting assumption like this,
you'll have to go through the kernel and search for places that rely
on this assumption, and fix them up, which may potentially require
thinking about what kinds of semantics would actually be appropriate
there. Like the MCE killing logic (collect_procs_anon() and such). And
current_is_single_threaded(), in which the current patch probably
leads to logic security bugs. And __uprobe_perf_filter(). Before my
refactoring of the ELF coredump logic in kernel 5.10 (commit
b2767d97f5ff75 and the ones before it), you'd have also probably
created memory corruption bugs in races between elf_core_dump() and
syscalls like mmap()/munmap(). (Note that this is not necessarily an
exhaustive list.)
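
For example, current_is_single_threaded() (lib/is_single_threaded.c,
abridged here from around v5.10) reasons directly about which threads
share ->mm, so a temporarily borrowed ->mm can make a multithreaded
supervisor pass the check:

bool current_is_single_threaded(void)
{
	struct task_struct *task = current;
	struct mm_struct *mm = task->mm;
	struct task_struct *p, *t;
	bool ret;

	if (atomic_read(&task->signal->live) != 1)
		return false;

	if (atomic_read(&mm->mm_users) == 1)
		return true;

	ret = false;
	rcu_read_lock();
	for_each_process(p) {
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		if (unlikely(p == task->group_leader))
			continue;

		for_each_thread(p, t) {
			/* assumes t->mm == mm means "t is one of our
			 * threads (or a CLONE_VM relative)" */
			if (unlikely(t->mm == mm))
				goto found;
			if (likely(t->mm))
				break;
			smp_rmb();
		}
	}
	ret = true;
found:
	rcu_read_unlock();

	return ret;
}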
Andy Lutomirski July 2, 2021, 8:40 p.m. UTC | #11
On Fri, Jul 2, 2021, at 4:51 AM, Jann Horn wrote:
> On Fri, Jul 2, 2021 at 8:25 AM Andrei Vagin <avagin@gmail.com> wrote:
> > On Mon, Jun 28, 2021 at 06:13:29PM +0200, Jann Horn wrote:
> > > On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> > > > +static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
> > > > +{
> > > > +       struct task_struct *tsk = current;
> > > > +       struct mm_struct *active_mm;
> > > > +
> > > > +       task_lock(tsk);
> > > > +       /* Hold off tlb flush IPIs while switching mm's */
> > > > +       local_irq_disable();
> > > > +
> > > > +       sync_mm_rss(prev_mm);
> > > > +
> > > > +       vmacache_flush(tsk);
> > > > +
> > > > +       active_mm = tsk->active_mm;
> > > > +       if (active_mm != target_mm) {
> > > > +               mmgrab(target_mm);
> > > > +               tsk->active_mm = target_mm;
> > > > +       }
> > > > +       tsk->mm = target_mm;
> > >
> > > I'm pretty sure you're not currently allowed to overwrite the ->mm
> > > pointer of a userspace thread. For example, zap_threads() assumes that
> > > all threads running under a process have the same ->mm. (And if you're
> > > fiddling with ->mm stuff, you should probably CC linux-mm@.)
> > >
> > > As far as I understand, only kthreads are allowed to do this (as
> > > implemented in kthread_use_mm()).
> >
> > kthread_use_mm() was renamed from use_mm() in the v5.8 kernel. Before
> > that, it wasn't used for user processes in the kernel, but it was
> > exported for modules, and we used it without any visible problems. We
> > understood that there could be some issues like zap_threads, and that
> > was one of the reasons why we decided to introduce this system call.
> >
> > I understand that there are no places in the kernel where we change the
> > mm of user threads back and forth, but are there any real concerns why
> > we should not do that? I agree that zap_threads should be fixed, but
> > that will be the easy one.
> 
> My point is that if you break a preexisting assumption like this,
> you'll have to go through the kernel and search for places that rely
> on this assumption, and fix them up, which may potentially require
> thinking about what kinds of semantics would actually be appropriate
> there. Like the MCE killing logic (collect_procs_anon() and such). And
> current_is_single_threaded(), in which the current patch probably
> leads to logic security bugs. And __uprobe_perf_filter(). Before my
> refactoring of the ELF coredump logic in kernel 5.10 (commit
> b2767d97f5ff75 and the ones before it), you'd have also probably
> created memory corruption bugs in races between elf_core_dump() and
> syscalls like mmap()/munmap(). (Note that this is not necessarily an
> exhaustive list.)
> 

There’s nmi_uaccess_okay(), and its callers assume that, when a task is perf tracing itself, an event on that task with nmi_uaccess_okay() means that uaccess will access that task’s memory.

Core dump code probably expects that dumping memory will access the correct mm.

I cannot fathom why any kind of remote vm access would touch FPU state at all.

What PKRU value is supposed to be used when doing mm swap shenanigans?  How about PASID?

What happens if one task attempts to issue a KVM ioctl while its mm is swapped?
Jann Horn July 2, 2021, 8:56 p.m. UTC | #12
On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> This change introduces the new system call:
> process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
>                 siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
>
> process_vm_exec allows executing the current process in the address
> space of another process.
>
> process_vm_exec swaps the current address space with the address space
> of the specified process, sets the state from sigcontext, and resumes
> the process. When the process receives a signal or calls a system call,
> process_vm_exec saves the process state back to sigcontext, restores
> the original address space, restores the original process state, and
> returns to userspace.
>
> If process_vm_exec is interrupted by a signal and the signal is in
> user_mask, the signal is dequeued and information about it is saved in
> uinfo. If process_vm_exec is interrupted by a system call, a synthetic
> siginfo for the SIGSYS signal is generated.
>
> The behavior of this system call is similar to PTRACE_SYSEMU, but
> everything happens in the context of one process, so process_vm_exec
> shows better performance.
>
> PTRACE_SYSEMU is primarily used to implement sandboxes (application
> kernels) like User-mode Linux or gVisor. These types of sandboxes
> intercept application system calls and act as the guest kernel.
> A simple benchmark, where a "tracee" process executes system calls in a
> loop and a "tracer" process traps the syscalls and handles them by just
> incrementing the tracee's instruction pointer to skip the syscall
> instruction, shows that process_vm_exec works more than 5 times faster
> than PTRACE_SYSEMU.
[...]
> +long swap_vm_exec_context(struct sigcontext __user *uctx)
> +{
> +       struct sigcontext ctx = {};
> +       sigset_t set = {};
> +
> +
> +       if (copy_from_user(&ctx, uctx, CONTEXT_COPY_SIZE))
> +               return -EFAULT;
> +       /* A floating point state is managed from user-space. */
> +       if (ctx.fpstate != 0)
> +               return -EINVAL;
> +       if (!user_access_begin(uctx, sizeof(*uctx)))
> +               return -EFAULT;
> +       unsafe_put_sigcontext(uctx, NULL, current_pt_regs(), (&set), Efault);
> +       user_access_end();
> +
> +       if (__restore_sigcontext(current_pt_regs(), &ctx, 0))
> +               goto badframe;
> +
> +       return 0;
> +Efault:
> +       user_access_end();
> +badframe:
> +       signal_fault(current_pt_regs(), uctx, "swap_vm_exec_context");
> +       return -EFAULT;
> +}

Comparing the pieces of context that restore_sigcontext() restores
with what a normal task switch does (see __switch_to() and callees), I
noticed: On CPUs with FSGSBASE support, I think sandboxed code could
overwrite FSBASE/GSBASE using the WRFSBASE/WRGSBASE instructions,
causing the supervisor to access attacker-controlled addresses when it
tries to access a thread-local variable like "errno"? Signal handling
saves the segment registers, but not the FS/GS base addresses.


jannh@laptop:~/test$ cat signal_gsbase.c
// compile with -mfsgsbase
#include <stdio.h>
#include <signal.h>
#include <immintrin.h>

void signal_handler(int sig, siginfo_t *info, void *ucontext_) {
  puts("signal handler");
  _writegsbase_u64(0x12345678);
}

int main(void) {
  struct sigaction new_act = {
    .sa_sigaction = signal_handler,
    .sa_flags = SA_SIGINFO
  };
  sigaction(SIGUSR1, &new_act, NULL);

  printf("original gsbase is 0x%lx\n", _readgsbase_u64());
  raise(SIGUSR1);
  printf("post-signal gsbase is 0x%lx\n", _readgsbase_u64());
}
jannh@laptop:~/test$ gcc -o signal_gsbase signal_gsbase.c -mfsgsbase
jannh@laptop:~/test$ ./signal_gsbase
original gsbase is 0x0
signal handler
post-signal gsbase is 0x12345678
jannh@laptop:~/test$


So to make this usable for a sandboxing usecase, you'd also have to
save and restore FSBASE/GSBASE, just like __switch_to().
Andrei Vagin July 2, 2021, 10:21 p.m. UTC | #13
On Fri, Jul 02, 2021 at 10:51:13AM +0200, Peter Zijlstra wrote:
> 
> I'm terrified of all of this...
> 
> On Tue, Apr 13, 2021 at 10:52:15PM -0700, Andrei Vagin wrote:
> 
> > +long swap_vm_exec_context(struct sigcontext __user *uctx)
> > +{
> > +	struct sigcontext ctx = {};
> > +	sigset_t set = {};
> > +
> > +
> > +	if (copy_from_user(&ctx, uctx, CONTEXT_COPY_SIZE))
> > +		return -EFAULT;
> > +	/* A floating point state is managed from user-space. */
> > +	if (ctx.fpstate != 0)
> > +		return -EINVAL;

Here, we check that ctx doesn't have an FPU state.

> > +	if (!user_access_begin(uctx, sizeof(*uctx)))
> > +		return -EFAULT;
> > +	unsafe_put_sigcontext(uctx, NULL, current_pt_regs(), (&set), Efault);
> > +	user_access_end();
> 
> But here you save the sigcontext without FPU state.
> 
> > +
> > +	if (__restore_sigcontext(current_pt_regs(), &ctx, 0))
> > +		goto badframe;
> 
> And here you restore sigcontext, *with* FPU state.  At which point your
> FPU state is irrecoverably lost.

process_vm_exec doesn't change the process's FPU state. Unlike with
signals, here we can control it from user-space. A process can set an
FPU state before process_vm_exec and then restore its FPU state after
the call.

This version of the patches has a bug that I fixed in my tree when I
implemented the user-space part for gVisor. I didn't take into account
that restore_sigcontext(ctx) clears the process's FPU state if
ctx->fpstate is zero. I moved fpu__restore_sig() out of
__restore_sigcontext() to fix this issue:

https://github.com/avagin/linux-task-diag/commit/55b7194d00ff

> 
> Also, I'm not at all convinced this can ever do the right thing when the
> tasks don't agree on what the FPU state is. I suppose in the best case
> the save will EFAULT.
> 
> > +
> > +	return 0;
> > +Efault:
> > +	user_access_end();
> > +badframe:
> > +	signal_fault(current_pt_regs(), uctx, "swap_vm_exec_context");
> > +	return -EFAULT;
> > +}
Andrei Vagin July 2, 2021, 10:48 p.m. UTC | #14
On Fri, Jul 02, 2021 at 10:56:38PM +0200, Jann Horn wrote:
> On Wed, Apr 14, 2021 at 7:59 AM Andrei Vagin <avagin@gmail.com> wrote:
> > This change introduces the new system call:
> > process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
> >                 siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
> >
> > process_vm_exec allows executing the current process in the address
> > space of another process.
> >
> > process_vm_exec swaps the current address space with the address space
> > of the specified process, sets the state from sigcontext, and resumes
> > the process. When the process receives a signal or calls a system call,
> > process_vm_exec saves the process state back to sigcontext, restores
> > the original address space, restores the original process state, and
> > returns to userspace.
> >
> > If process_vm_exec is interrupted by a signal and the signal is in
> > user_mask, the signal is dequeued and information about it is saved in
> > uinfo. If process_vm_exec is interrupted by a system call, a synthetic
> > siginfo for the SIGSYS signal is generated.
> >
> > The behavior of this system call is similar to PTRACE_SYSEMU, but
> > everything happens in the context of one process, so process_vm_exec
> > shows better performance.
> >
> > PTRACE_SYSEMU is primarily used to implement sandboxes (application
> > kernels) like User-mode Linux or gVisor. These types of sandboxes
> > intercept application system calls and act as the guest kernel.
> > A simple benchmark, where a "tracee" process executes system calls in a
> > loop and a "tracer" process traps the syscalls and handles them by just
> > incrementing the tracee's instruction pointer to skip the syscall
> > instruction, shows that process_vm_exec works more than 5 times faster
> > than PTRACE_SYSEMU.
> [...]
> > +long swap_vm_exec_context(struct sigcontext __user *uctx)
> > +{
> > +       struct sigcontext ctx = {};
> > +       sigset_t set = {};
> > +
> > +
> > +       if (copy_from_user(&ctx, uctx, CONTEXT_COPY_SIZE))
> > +               return -EFAULT;
> > +       /* A floating point state is managed from user-space. */
> > +       if (ctx.fpstate != 0)
> > +               return -EINVAL;
> > +       if (!user_access_begin(uctx, sizeof(*uctx)))
> > +               return -EFAULT;
> > +       unsafe_put_sigcontext(uctx, NULL, current_pt_regs(), (&set), Efault);
> > +       user_access_end();
> > +
> > +       if (__restore_sigcontext(current_pt_regs(), &ctx, 0))
> > +               goto badframe;
> > +
> > +       return 0;
> > +Efault:
> > +       user_access_end();
> > +badframe:
> > +       signal_fault(current_pt_regs(), uctx, "swap_vm_exec_context");
> > +       return -EFAULT;
> > +}
> 
> Comparing the pieces of context that restore_sigcontext() restores
> with what a normal task switch does (see __switch_to() and callees), I
> noticed: On CPUs with FSGSBASE support, I think sandboxed code could
> overwrite FSBASE/GSBASE using the WRFSBASE/WRGSBASE instructions,
> causing the supervisor to access attacker-controlled addresses when it
> tries to access a thread-local variable like "errno"? Signal handling
> saves the segment registers, but not the FS/GS base addresses.
> 
> 
> jannh@laptop:~/test$ cat signal_gsbase.c
> // compile with -mfsgsbase
> #include <stdio.h>
> #include <signal.h>
> #include <immintrin.h>
> 
> void signal_handler(int sig, siginfo_t *info, void *ucontext_) {
>   puts("signal handler");
>   _writegsbase_u64(0x12345678);
> }
> 
> int main(void) {
>   struct sigaction new_act = {
>     .sa_sigaction = signal_handler,
>     .sa_flags = SA_SIGINFO
>   };
>   sigaction(SIGUSR1, &new_act, NULL);
> 
>   printf("original gsbase is 0x%lx\n", _readgsbase_u64());
>   raise(SIGUSR1);
>   printf("post-signal gsbase is 0x%lx\n", _readgsbase_u64());
> }
> jannh@laptop:~/test$ gcc -o signal_gsbase signal_gsbase.c -mfsgsbase
> jannh@laptop:~/test$ ./signal_gsbase
> original gsbase is 0x0
> signal handler
> post-signal gsbase is 0x12345678
> jannh@laptop:~/test$
> 
> 
> So to make this usable for a sandboxing usecase, you'd also have to
> save and restore FSBASE/GSBASE, just like __switch_to().

You are right. I found this too when I implemented the gVisor
user-space part.

Here is the tree where this problem has been fixed:
https://github.com/avagin/linux-task-diag/commits/wip/gvisor-5.10

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index ba4e966484ab..3ed9b8fb1727 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -514,6 +514,21 @@  config SECCOMP_FILTER
 
 	  See Documentation/userspace-api/seccomp_filter.rst for details.
 
+config HAVE_ARCH_PROCESS_VM_EXEC
+	bool
+	help
+	  An arch should select this symbol to support the process_vm_exec system call.
+
+config PROCESS_VM_EXEC
+	prompt "Enable the process_vm_exec syscall"
+	def_bool y
+	depends on HAVE_ARCH_PROCESS_VM_EXEC
+	help
+	  process_vm_exec allows executing code and system calls in a specified
+	  address space.
+
+	  If unsure, say Y.
+
 config HAVE_ARCH_STACKLEAK
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fbf26e0f7a6a..1c7ebb58865e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@  config X86_64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_SOFT_DIRTY
+	select HAVE_ARCH_PROCESS_VM_EXEC
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE
 	select SWIOTLB
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 870efeec8bda..42eac459b25b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -19,6 +19,7 @@ 
 #include <linux/nospec.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/process_vm_exec.h>
 
 #ifdef CONFIG_XEN_PV
 #include <xen/xen-ops.h>
@@ -38,6 +39,21 @@ 
 #ifdef CONFIG_X86_64
 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 {
+#ifdef CONFIG_PROCESS_VM_EXEC
+	if (current->exec_mm && current->exec_mm->ctx) {
+		kernel_siginfo_t info = {
+			.si_signo = SIGSYS,
+			.si_call_addr = (void __user *)KSTK_EIP(current),
+			.si_arch = syscall_get_arch(current),
+			.si_syscall = nr,
+		};
+		restore_vm_exec_context(regs);
+		regs->ax = copy_siginfo_to_user(current->exec_mm->siginfo, &info);
+		syscall_exit_to_user_mode(regs);
+		return;
+	}
+#endif
+
 	nr = syscall_enter_from_user_mode(regs, nr);
 
 	instrumentation_begin();
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 379819244b91..2a8e27b2d87e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -362,6 +362,7 @@ 
 438	common	pidfd_getfd		sys_pidfd_getfd
 439	common	faccessat2		sys_faccessat2
 440	common	process_madvise		sys_process_madvise
+441	64	process_vm_exec		sys_process_vm_exec
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 140d890c2c98..e390410cc3e9 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -6,4 +6,6 @@ 
 
 #include <uapi/asm/sigcontext.h>
 
+extern long swap_vm_exec_context(struct sigcontext __user *uctx);
+
 #endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 68608bd892c0..d053289fd19e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -163,3 +163,4 @@  ifeq ($(CONFIG_X86_64),y)
 endif
 
 obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)	+= ima_arch.o
+obj-$(CONFIG_PROCESS_VM_EXEC)	+= process_vm_exec.o
diff --git a/arch/x86/kernel/process_vm_exec.c b/arch/x86/kernel/process_vm_exec.c
new file mode 100644
index 000000000000..28b32330f744
--- /dev/null
+++ b/arch/x86/kernel/process_vm_exec.c
@@ -0,0 +1,133 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/syscall.h>
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+#include <asm/mmu_context.h>
+#include <asm/sigcontext.h>
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/syscalls.h>
+#include <linux/vmacache.h>
+#include <linux/process_vm_exec.h>
+
+static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
+{
+	struct task_struct *tsk = current;
+	struct mm_struct *active_mm;
+
+	task_lock(tsk);
+	/* Hold off tlb flush IPIs while switching mm's */
+	local_irq_disable();
+
+	sync_mm_rss(prev_mm);
+
+	vmacache_flush(tsk);
+
+	active_mm = tsk->active_mm;
+	if (active_mm != target_mm) {
+		mmgrab(target_mm);
+		tsk->active_mm = target_mm;
+	}
+	tsk->mm = target_mm;
+	switch_mm_irqs_off(active_mm, target_mm, tsk);
+	local_irq_enable();
+	task_unlock(tsk);
+#ifdef finish_arch_post_lock_switch
+	finish_arch_post_lock_switch();
+#endif
+
+	if (active_mm != target_mm)
+		mmdrop(active_mm);
+}
+
+void restore_vm_exec_context(struct pt_regs *regs)
+{
+	struct sigcontext __user *uctx;
+	struct mm_struct *prev_mm, *target_mm;
+
+	uctx = current->exec_mm->ctx;
+	current->exec_mm->ctx = NULL;
+
+	target_mm = current->exec_mm->mm;
+	current->exec_mm->mm = NULL;
+	prev_mm = current->mm;
+
+	swap_mm(prev_mm, target_mm);
+
+	mmput(prev_mm);
+	mmdrop(target_mm);
+
+	swap_vm_exec_context(uctx);
+}
+
+SYSCALL_DEFINE6(process_vm_exec, pid_t, pid, struct sigcontext __user *, uctx,
+		unsigned long, flags, siginfo_t __user *, uinfo,
+		sigset_t __user *, user_mask, size_t, sizemask)
+{
+	struct mm_struct *prev_mm, *mm;
+	struct task_struct *tsk;
+	long ret = -ESRCH;
+
+	sigset_t mask;
+
+	if (flags)
+		return -EINVAL;
+
+	if (sizemask != sizeof(sigset_t))
+		return -EINVAL;
+	if (copy_from_user(&mask, user_mask, sizeof(mask)))
+		return -EFAULT;
+
+	sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+	signotset(&mask);
+
+	tsk = find_get_task_by_vpid(pid);
+	if (!tsk) {
+		ret = -ESRCH;
+		goto err;
+	}
+	mm = mm_access(tsk, PTRACE_MODE_ATTACH_REALCREDS);
+	put_task_struct(tsk);
+	if (!mm || IS_ERR(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		goto err;
+	}
+
+	current_pt_regs()->ax = 0;
+	ret = swap_vm_exec_context(uctx);
+	if (ret < 0)
+		goto err_mm_put;
+
+	if (!current->exec_mm) {
+		ret = -ENOMEM;
+		current->exec_mm = kmalloc(sizeof(*current->exec_mm), GFP_KERNEL);
+		if (current->exec_mm == NULL)
+			goto err_mm_put;
+	}
+	current->exec_mm->ctx = uctx;
+	current->exec_mm->mm = current->mm;
+	current->exec_mm->flags = flags;
+	current->exec_mm->sigmask = mask;
+	current->exec_mm->siginfo = uinfo;
+	prev_mm = current->mm;
+
+	mmgrab(prev_mm);
+	swap_mm(prev_mm, mm);
+
+	ret = current_pt_regs()->ax;
+
+	return ret;
+err_mm_put:
+	mmput(mm);
+err:
+	return ret;
+}
+
+void free_exec_mm_struct(struct task_struct *p)
+{
+	kfree(p->exec_mm);
+	p->exec_mm = NULL;
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cc269a20dd5f..51286c79062b 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@ 
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
 #include <linux/syscalls.h>
+#include <linux/process_vm_exec.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -816,6 +817,23 @@  void arch_do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 
+#ifdef CONFIG_PROCESS_VM_EXEC
+	if (current->exec_mm && current->exec_mm->ctx) {
+		kernel_siginfo_t info;
+		int ret;
+
+		restore_vm_exec_context(current_pt_regs());
+
+		spin_lock_irq(&current->sighand->siglock);
+		ret = dequeue_signal(current, &current->exec_mm->sigmask, &info);
+		spin_unlock_irq(&current->sighand->siglock);
+
+		if (ret > 0)
+			ret = copy_siginfo_to_user(current->exec_mm->siginfo, &info);
+		regs->ax = ret;
+	}
+#endif
+
 	if (get_signal(&ksig)) {
 		/* Whee! Actually deliver the signal.  */
 		handle_signal(&ksig, regs);
@@ -896,3 +914,32 @@  COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
 	return 0;
 }
 #endif
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+long swap_vm_exec_context(struct sigcontext __user *uctx)
+{
+	struct sigcontext ctx = {};
+	sigset_t set = {};
+
+
+	if (copy_from_user(&ctx, uctx, CONTEXT_COPY_SIZE))
+		return -EFAULT;
+	/* A floating point state is managed from user-space. */
+	if (ctx.fpstate != 0)
+		return -EINVAL;
+	if (!user_access_begin(uctx, sizeof(*uctx)))
+		return -EFAULT;
+	unsafe_put_sigcontext(uctx, NULL, current_pt_regs(), (&set), Efault);
+	user_access_end();
+
+	if (__restore_sigcontext(current_pt_regs(), &ctx, 0))
+		goto badframe;
+
+	return 0;
+Efault:
+	user_access_end();
+badframe:
+	signal_fault(current_pt_regs(), uctx, "swap_vm_exec_context");
+	return -EFAULT;
+}
+#endif
diff --git a/include/linux/process_vm_exec.h b/include/linux/process_vm_exec.h
new file mode 100644
index 000000000000..a02535fbd5c8
--- /dev/null
+++ b/include/linux/process_vm_exec.h
@@ -0,0 +1,15 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PROCESS_VM_EXEC_H
+#define _LINUX_PROCESS_VM_EXEC_H
+
+struct exec_mm {
+	struct sigcontext *ctx;
+	struct mm_struct *mm;
+	unsigned long flags;
+	sigset_t sigmask;
+	siginfo_t __user *siginfo;
+};
+
+void free_exec_mm_struct(struct task_struct *tsk);
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 76cd21fa5501..864a8fdd0ed7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -64,6 +64,7 @@  struct signal_struct;
 struct task_delay_info;
 struct task_group;
 struct io_uring_task;
+struct exec_mm;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -637,6 +638,8 @@  struct wake_q_node {
 	struct wake_q_node *next;
 };
 
+struct exec_mm;
+
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/*
@@ -757,6 +760,10 @@  struct task_struct {
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
 
+#ifdef CONFIG_PROCESS_VM_EXEC
+	struct exec_mm			*exec_mm;
+#endif
+
 	/* Per-thread vma caching: */
 	struct vmacache			vmacache;
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 37bea07c12f2..bdea75a14975 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1347,4 +1347,10 @@  int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
 		int __user *optlen);
 int __sys_setsockopt(int fd, int level, int optname, char __user *optval,
 		int optlen);
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+void restore_vm_exec_context(struct pt_regs *regs);
+#else
+static inline void restore_vm_exec_context(struct pt_regs *regs) {}
+#endif
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 2056318988f7..60acbd9cf511 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -859,9 +859,11 @@  __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
 #define __NR_process_madvise 440
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_process_vm_exec 441
+__SYSCALL(__NR_process_vm_exec, sys_process_vm_exec)
 
 #undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..61ca7a4a1130 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,6 +96,7 @@ 
 #include <linux/kasan.h>
 #include <linux/scs.h>
 #include <linux/io_uring.h>
+#include <linux/process_vm_exec.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -461,6 +462,9 @@  void free_task(struct task_struct *tsk)
 	arch_release_task_struct(tsk);
 	if (tsk->flags & PF_KTHREAD)
 		free_kthread_struct(tsk);
+#ifdef CONFIG_PROCESS_VM_EXEC
+	free_exec_mm_struct(tsk);
+#endif
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -943,6 +947,11 @@  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_MEMCG
 	tsk->active_memcg = NULL;
 #endif
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+	tsk->exec_mm = NULL;
+#endif
+
 	return tsk;
 
 free_stack:
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index f27ac94d5fa7..2545a409bb07 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -350,6 +350,8 @@  COND_SYSCALL(pkey_mprotect);
 COND_SYSCALL(pkey_alloc);
 COND_SYSCALL(pkey_free);
 
+/* execute in another address space */
+COND_SYSCALL(process_vm_exec);
 
 /*
  * Architecture specific weak syscall entries.