sched/x86: Save [ER]FLAGS on context switch

Message ID 20190214101429.GD32494@hirez.programming.kicks-ass.net

Commit Message

Peter Zijlstra Feb. 14, 2019, 10:14 a.m. UTC
On Wed, Feb 13, 2019 at 02:49:47PM -0800, Andy Lutomirski wrote:

> Do we need to backport this thing?

Possibly, just to be safe.

> The problem can’t be too widespread or we would have heard of it before.

Yes, so far we've been lucky.

---
Subject: sched/x86: Save [ER]FLAGS on context switch
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Feb 14 10:30:52 CET 2019

Effectively revert commit:

  2c7577a75837 ("sched/x86_64: Don't save flags on context switch")

Specifically because SMAP uses FLAGS.AC which invalidates the claim
that the kernel has clean flags.

In particular, while preemption from interrupt return is fine (the
IRET frame on the exception stack contains FLAGS), it breaks any code
that does synchronous scheduling, including preempt_enable().

This has become a significant issue ever since commit:

  5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")

provided the means of having 'normal' C code between STAC / CLAC,
exposing the FLAGS.AC state. So far this hasn't led to trouble, but
fix it before it comes apart.

Fixes: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
Acked-by: Andy Lutomirski <luto@amacapital.net>
Reported-by: Julien Thierry <julien.thierry@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/entry/entry_32.S        |    2 ++
 arch/x86/entry/entry_64.S        |    2 ++
 arch/x86/include/asm/switch_to.h |    1 +
 3 files changed, 5 insertions(+)
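
To make the hazard concrete, here is a minimal sketch of the pattern the
commit message describes -- 'normal' C code between STAC and CLAC that can
end up scheduling synchronously. The struct and helper are hypothetical;
only the user_access_begin()/unsafe_put_user()/user_access_end() API
(<linux/uaccess.h>) is real:

	static int fill_uitem(struct uitem __user *uptr, const struct uitem *k)
	{
		if (!user_access_begin(uptr, sizeof(*uptr)))
			return -EFAULT;			/* STAC: EFLAGS.AC = 1 */

		unsafe_put_user(k->a, &uptr->a, Efault);

		/*
		 * Any 'normal' C here can end up in preempt_disable() /
		 * preempt_enable(); the latter may call schedule()
		 * synchronously. Without this patch, __switch_to_asm() does
		 * not save FLAGS, so the next task runs with EFLAGS.AC set.
		 */
		some_helper(k);				/* hypothetical */

		unsafe_put_user(k->b, &uptr->b, Efault);
		user_access_end();			/* CLAC: EFLAGS.AC = 0 */
		return 0;

	Efault:
		user_access_end();
		return -EFAULT;
	}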

Comments

Brian Gerst Feb. 14, 2019, 4:18 p.m. UTC | #1
On Thu, Feb 14, 2019 at 5:14 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Feb 13, 2019 at 02:49:47PM -0800, Andy Lutomirski wrote:
>
> > Do we need to backport this thing?
>
> Possibly, just to be safe.
>
> > The problem can’t be too widespread or we would have heard of it before.
>
> Yes, so far we've been lucky.
>
> ---
> Subject: sched/x86: Save [ER]FLAGS on context switch
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Thu Feb 14 10:30:52 CET 2019
>
> Effectively revert commit:
>
>   2c7577a75837 ("sched/x86_64: Don't save flags on context switch")
>
> Specifically because SMAP uses FLAGS.AC which invalidates the claim
> that the kernel has clean flags.
>
> In particular, while preemption from interrupt return is fine (the
> IRET frame on the exception stack contains FLAGS), it breaks any code
> that does synchronous scheduling, including preempt_enable().
>
> This has become a significant issue ever since commit:
>
>   5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
>
> provided the means of having 'normal' C code between STAC / CLAC,
> exposing the FLAGS.AC state. So far this hasn't led to trouble, but
> fix it before it comes apart.
>
> Fixes: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
> Acked-by: Andy Lutomirski <luto@amacapital.net>
> Reported-by: Julien Thierry <julien.thierry@arm.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/entry/entry_32.S        |    2 ++
>  arch/x86/entry/entry_64.S        |    2 ++
>  arch/x86/include/asm/switch_to.h |    1 +
>  3 files changed, 5 insertions(+)
>
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -650,6 +650,7 @@ ENTRY(__switch_to_asm)
>         pushl   %ebx
>         pushl   %edi
>         pushl   %esi
> +       pushfl
>
>         /* switch stack */
>         movl    %esp, TASK_threadsp(%eax)
> @@ -672,6 +673,7 @@ ENTRY(__switch_to_asm)
>  #endif
>
>         /* restore callee-saved registers */
> +       popfl
>         popl    %esi
>         popl    %edi
>         popl    %ebx
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -291,6 +291,7 @@ ENTRY(__switch_to_asm)
>         pushq   %r13
>         pushq   %r14
>         pushq   %r15
> +       pushfq
>
>         /* switch stack */
>         movq    %rsp, TASK_threadsp(%rdi)
> @@ -313,6 +314,7 @@ ENTRY(__switch_to_asm)
>  #endif
>
>         /* restore callee-saved registers */
> +       popfq
>         popq    %r15
>         popq    %r14
>         popq    %r13
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
>   * order of the fields must match the code in __switch_to_asm().
>   */
>  struct inactive_task_frame {
> +       unsigned long flags;
>  #ifdef CONFIG_X86_64
>         unsigned long r15;
>         unsigned long r14;

flags should be initialized in copy_thread_tls().  I think the new
stack is zeroed already, but it would be better to explicitly set it.

--
Brian Gerst
Brian Gerst Feb. 15, 2019, 2:34 p.m. UTC | #2
On Thu, Feb 14, 2019 at 2:34 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Feb 14, 2019 at 11:18:01AM -0500, Brian Gerst wrote:
>
> > > --- a/arch/x86/include/asm/switch_to.h
> > > +++ b/arch/x86/include/asm/switch_to.h
> > > @@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
> > >   * order of the fields must match the code in __switch_to_asm().
> > >   */
> > >  struct inactive_task_frame {
> > > +       unsigned long flags;
> > >  #ifdef CONFIG_X86_64
> > >         unsigned long r15;
> > >         unsigned long r14;
> >
> > flags should be initialized in copy_thread_tls().  I think the new
> > stack is zeroed already, but it would be better to explicitly set it.
>
> Ah indeed. I somehow misread that code and thought we got initialized
> with a copy of current.
>
> Something like the below, right?
>
> ---
>
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -127,6 +127,7 @@ int copy_thread_tls(unsigned long clone_
>         struct task_struct *tsk;
>         int err;
>
> +       frame->flags = 0;
>         frame->bp = 0;
>         frame->ret_addr = (unsigned long) ret_from_fork;
>         p->thread.sp = (unsigned long) fork_frame;
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -392,6 +392,7 @@ int copy_thread_tls(unsigned long clone_
>         childregs = task_pt_regs(p);
>         fork_frame = container_of(childregs, struct fork_frame, regs);
>         frame = &fork_frame->frame;
> +       frame->flags = 0;
>         frame->bp = 0;
>         frame->ret_addr = (unsigned long) ret_from_fork;
>         p->thread.sp = (unsigned long) fork_frame;

Yes, this looks good.

--
Brian Gerst
Linus Torvalds Feb. 15, 2019, 5:18 p.m. UTC | #3
On Thu, Feb 14, 2019 at 11:34 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Something like the below, right?
>
> +       frame->flags = 0;
> +       frame->flags = 0;

Those are not valid flag values.

Can you popf them? Yes.

Do they make sense? No.

It has the IF flag clear, for example. Is that intentional? If it is,
it should likely be documented. Because IF clear means "interrupts
disabled". Are the places that load flags in irq disabled regions?
Maybe they are, maybe they aren't, but shouldn't this be something
that is made explicit, rather than "oh, let's initialize to zero like
all the other registers we don't care about".

Because a zero initializer for eflags really is odd.

             Linus
Peter Zijlstra Feb. 15, 2019, 5:40 p.m. UTC | #4
On Fri, Feb 15, 2019 at 09:18:00AM -0800, Linus Torvalds wrote:
> On Thu, Feb 14, 2019 at 11:34 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > Something like the below, right?
> >
> > +       frame->flags = 0;
> > +       frame->flags = 0;
> 
> Those are not valid flag values.
> 
> Can you popf them? Yes.
> 
> Do they make sense? No.
> 
> It has the IF flag clear, for example. Is that intentional? If it is,

Uhmm. yeah, that's bonkers. We should have interrupts disabled here.
I'll go read up on the eflags and figure out what they _should_ be right
about there.
Andy Lutomirski Feb. 15, 2019, 6:28 p.m. UTC | #5
> On Feb 15, 2019, at 9:40 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
>> On Fri, Feb 15, 2019 at 09:18:00AM -0800, Linus Torvalds wrote:
>>> On Thu, Feb 14, 2019 at 11:34 AM Peter Zijlstra <peterz@infradead.org> wrote:
>>> 
>>> Something like the below, right?
>>> 
>>> +       frame->flags = 0;
>>> +       frame->flags = 0;
>> 
>> Those are not valid flag values.
>> 
>> Can you popf them? Yes.
>> 
>> Do they make sense? No.
>> 
>> It has the IF flag clear, for example. Is that intentional? If it is,
> 
> Uhmm. yeah, that's bonkers. We should have interrupts disabled here.
> I'll go read up on the eflags and figure out what they _should_ be right
> about there.

And probably add a comment near the POPF explaining that it will keep IRQs off :)
Peter Zijlstra Feb. 15, 2019, 11:34 p.m. UTC | #6
On Fri, Feb 15, 2019 at 06:40:34PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 15, 2019 at 09:18:00AM -0800, Linus Torvalds wrote:
> > On Thu, Feb 14, 2019 at 11:34 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > Something like the below, right?
> > >
> > > +       frame->flags = 0;
> > > +       frame->flags = 0;
> > 
> > Those are not valid flag values.
> > 
> > Can you popf them? Yes.
> > 
> > Do they make sense? No.
> > 
> > It has the IF flag clear, for example. Is that intentional? If it is,
> 
> Uhmm. yeah, that's bonkers. We should have interrupts disabled here.
> I'll go read up on the eflags and figure out what they _should_ be right
> about there.

I misread (I'm forever confused about which way around IF goes), but you
said it right; IF=0 is interrupts disabled and we very much have that in
the middle of context switch.

(just for giggles I set IF for the initial flags value; and it comes
unstuck _real_ quick)

Now, EFLAGS bit 1 is supposedly always 1, but it really doesn't seem to
matter for POPF.

I went through the other flags, and aside from VIP/VIF (I've no clue),
they look like 0 should be just fine.
Linus Torvalds Feb. 16, 2019, 12:21 a.m. UTC | #7
On Fri, Feb 15, 2019 at 3:34 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Now, EFLAGS bit 1 is supposedly always 1, but it really doesn't seem to
> matter for POPF.

Correct, it's "read as 1", you can try to write it and it doesn't matter.

> I went through the other flags, and aside from VIP/VIF (I've no clue),
> they look like 0 should be just fine.

So 0 is a perfectly valid initializer in the sense that it _works_; I
just want it to be something that was thought about, not just a random
"initialize to zero" without thinking.

Even just a comment about it would be fine. But it might also be good
to show that it's an explicit eflags value and just use
X86_EFLAGS_FIXED as the initializer.

            Linus
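
A sketch of the explicit initializer suggested above, as it would sit in
the copy_thread_tls() hunk (an illustration, not the hunk as reposted;
X86_EFLAGS_FIXED is the real constant from <asm/processor-flags.h>, bit 1,
which reads as 1):

	frame = &fork_frame->frame;
	frame->flags = X86_EFLAGS_FIXED;	/* clean flags; IF = 0, irqs stay off */
	frame->bp = 0;
	frame->ret_addr = (unsigned long) ret_from_fork;

The POPF in __switch_to_asm() then gives every new task clean, irqs-off
flags the first time it is switched to, and per #7 the fixed bit reads as
1 regardless of what POPF writes.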
H. Peter Anvin Feb. 16, 2019, 4:06 a.m. UTC | #8
On February 14, 2019 2:14:29 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Wed, Feb 13, 2019 at 02:49:47PM -0800, Andy Lutomirski wrote:
>
>> Do we need to backport this thing?
>
>Possibly, just to be safe.
>
>> The problem can’t be too widespread or we would have heard of it before.
>
>Yes, so far we've been lucky.
>
>---
>Subject: sched/x86: Save [ER]FLAGS on context switch
>From: Peter Zijlstra <peterz@infradead.org>
>Date: Thu Feb 14 10:30:52 CET 2019
>
>Effectively revert commit:
>
>  2c7577a75837 ("sched/x86_64: Don't save flags on context switch")
>
>Specifically because SMAP uses FLAGS.AC which invalidates the claim
>that the kernel has clean flags.
>
>In particular, while preemption from interrupt return is fine (the
>IRET frame on the exception stack contains FLAGS), it breaks any code
>that does synchronous scheduling, including preempt_enable().
>
>This has become a significant issue ever since commit:
>
>  5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
>
>provided the means of having 'normal' C code between STAC / CLAC,
>exposing the FLAGS.AC state. So far this hasn't led to trouble, but
>fix it before it comes apart.
>
>Fixes: 5b24a7a2aa20 ("Add 'unsafe' user access functions for batched accesses")
>Acked-by: Andy Lutomirski <luto@amacapital.net>
>Reported-by: Julien Thierry <julien.thierry@arm.com>
>Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>---
> arch/x86/entry/entry_32.S        |    2 ++
> arch/x86/entry/entry_64.S        |    2 ++
> arch/x86/include/asm/switch_to.h |    1 +
> 3 files changed, 5 insertions(+)
>
>--- a/arch/x86/entry/entry_32.S
>+++ b/arch/x86/entry/entry_32.S
>@@ -650,6 +650,7 @@ ENTRY(__switch_to_asm)
> 	pushl	%ebx
> 	pushl	%edi
> 	pushl	%esi
>+	pushfl
> 
> 	/* switch stack */
> 	movl	%esp, TASK_threadsp(%eax)
>@@ -672,6 +673,7 @@ ENTRY(__switch_to_asm)
> #endif
> 
> 	/* restore callee-saved registers */
>+	popfl
> 	popl	%esi
> 	popl	%edi
> 	popl	%ebx
>--- a/arch/x86/entry/entry_64.S
>+++ b/arch/x86/entry/entry_64.S
>@@ -291,6 +291,7 @@ ENTRY(__switch_to_asm)
> 	pushq	%r13
> 	pushq	%r14
> 	pushq	%r15
>+	pushfq
> 
> 	/* switch stack */
> 	movq	%rsp, TASK_threadsp(%rdi)
>@@ -313,6 +314,7 @@ ENTRY(__switch_to_asm)
> #endif
> 
> 	/* restore callee-saved registers */
>+	popfq
> 	popq	%r15
> 	popq	%r14
> 	popq	%r13
>--- a/arch/x86/include/asm/switch_to.h
>+++ b/arch/x86/include/asm/switch_to.h
>@@ -40,6 +40,7 @@ asmlinkage void ret_from_fork(void);
>  * order of the fields must match the code in __switch_to_asm().
>  */
> struct inactive_task_frame {
>+	unsigned long flags;
> #ifdef CONFIG_X86_64
> 	unsigned long r15;
> 	unsigned long r14;

This implies we invoke schedule -- a restricted operation (consider might_sleep) during execution of STAC-enabled code, but *not* as an exception or interrupt, since those preserve the flags.

I have serious concerns about this. This is more or less saying that we have left an unlimited gap and have had AC escape.

Does *anyone* see a need to allow this? I got a question at LPC from someone about this, and what they were trying to do once all the layers had been unwound was so far down the wrong track for a root problem that actually has a very simple solution.
Peter Zijlstra Feb. 16, 2019, 10:30 a.m. UTC | #9
On Fri, Feb 15, 2019 at 08:06:56PM -0800, hpa@zytor.com wrote:
> This implies we invoke schedule -- a restricted operation (consider
> might_sleep) during execution of STAC-enabled code, but *not* as an
> exception or interrupt, since those preserve the flags.

Meet preempt_enable().

> I have serious concerns about this. This is more or less saying that
> we have left an unlimited gap and have had AC escape.

Yes; by allowing normal C in between STAC and CLAC this is so.

> Does *anyone* see a need to allow this? I got a question at LPC from
> someone about this, and what they were trying to do once all the
> layers had been unwound was so far down the wrong track for a root
> problem that actually has a very simple solution.

Have you read the rest of the thread?

All it takes for this to explode is a call to a C function that has
tracing on it in between the user_access_begin() and user_access_end()
things. That is a _very_ easy thing to do.

Heck, GCC is allowed to generate that broken code compiling
lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
with CONFIG_OPTIMIZE_INLINING), and making that a function call would
get us fentry hooks and ftrace and *BOOM*.

(Now, of course, it's a static function with a single caller, and GCC
isn't _THAT_ stupid, but it could, if it wanted to)

Since the bar is _that_ low for mistakes, I figure we'd better fix it.
Peter Zijlstra Feb. 16, 2019, 10:32 a.m. UTC | #10
On Fri, Feb 15, 2019 at 04:21:46PM -0800, Linus Torvalds wrote:
> Even just a comment about it would be fine. But it might also be good
> to show that it's an explicit eflags value and just use
> X86_EFLAGS_FIXED as the initializer.

That is in fact what I have now; I'll repost on Monday or so.
H. Peter Anvin Feb. 18, 2019, 10:30 p.m. UTC | #11
On 2/16/19 2:30 AM, Peter Zijlstra wrote:
> On Fri, Feb 15, 2019 at 08:06:56PM -0800, hpa@zytor.com wrote:
>> This implies we invoke schedule -- a restricted operation (consider
>> might_sleep) during execution of STAC-enabled code, but *not* as an
>> exception or interrupt, since those preserve the flags.
> 
> Meet preempt_enable().

I believe this falls under "doctor, it hurts when I do that." And it hurts for
very good reasons. See below.

>> I have serious concerns about this. This is more or less saying that
>> we have left an unlimited gap and have had AC escape.
> 
> Yes; by allowing normal C in between STAC and CLAC this is so.
> 
>> Does *anyone* see a need to allow this? I got a question at LPC from
>> someone about this, and what they were trying to do once all the
>> layers had been unwound was so far down the wrong track for a root
>> problem that actually has a very simple solution.
> 
> Have you read the rest of the thread?
> 
> All it takes for this to explode is a call to a C function that has
> tracing on it in between the user_access_begin() and user_access_end()
> things. That is a _very_ easy thing to do.
> 
> Heck, GCC is allowed to generate that broken code compiling
> lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
> with CONFIG_OPTIMIZE_INLINING), and making that a function call would
> get us fentry hooks and ftrace and *BOOM*.
> 
> > (Now, of course, it's a static function with a single caller, and GCC
> isn't _THAT_ stupid, but it could, if it wanted to)
> 
> Since the bar is _that_ low for mistakes, I figure we'd better fix it.
> 

The question is what "fix it" means. I'm really concerned about AC escapes,
and everyone else should be, too.

For an issue specific to tracing, it would be more appropriate to do the
appropriate things in the tracing entry/exit than in schedule. Otherwise, we
don't want to silently paper over mistakes which could mean that we run a
large portion of the kernel without protection we thought we had.

In that sense, calling schedule() with AC set is in fact a great place to have
a WARN() or even BUG(), because such an event really could imply that the
kernel has been security compromised.

Does that make more sense?

	-hpa
Linus Torvalds Feb. 19, 2019, 12:24 a.m. UTC | #12
On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <hpa@zytor.com> wrote:
>
> The question is what "fix it" means. I'm really concerned about AC escapes,
> and everyone else should be, too.

I do think that it might be the right thing to do to add some kind of
WARN_ON_ONCE() for AC being set in various can-reschedule situations.

We'd just have to abstract it sanely. I'm sure arm64 has the exact
same issue with PAN - maybe it saves properly, but the same "we
wouldn't want to go through the scheduler with PAN clear".

On x86, we might as well check DF at the same time as AC.

              Linus
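
A sketch of what such an assertion could look like (hypothetical, not
merged code; native_save_fl() from <asm/irqflags.h> and the X86_EFLAGS_*
constants are the real kernel API):

	/* Warn once if we are about to reschedule with AC or DF set. */
	static inline void sched_check_eflags(void)
	{
		unsigned long flags = native_save_fl();	/* PUSHF; POP */

		WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF));
	}

Whether schedule() is the right place to probe is exactly what gets
debated below.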
Andy Lutomirski Feb. 19, 2019, 2:20 a.m. UTC | #13
> On Feb 18, 2019, at 4:24 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
>> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <hpa@zytor.com> wrote:
>> 
>> The question is what "fix it" means. I'm really concerned about AC escapes,
>> and everyone else should be, too.
> 
> I do think that it might be the right thing to do to add some kind of
> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
> 
> We'd just have to abstract it sanely. I'm sure arm64 has the exact
> same issue with PAN - maybe it saves properly, but the same "we
> wouldn't want to go through the scheduler with PAN clear".
> 
> On x86, we might as well check DF at the same time as AC.
> 
> 

hpa is right, though — calling into tracing code with AC set is not really so good.  And calling schedule() (via preempt_enable() or whatever) is also bad because it runs all the scheduler code with AC on.  Admittedly, the scheduler is not *that* interesting of an attack surface.
H. Peter Anvin Feb. 19, 2019, 2:46 a.m. UTC | #14
On 2/18/19 6:20 PM, Andy Lutomirski wrote:
> 
> 
>> On Feb 18, 2019, at 4:24 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>
>>> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <hpa@zytor.com> wrote:
>>>
>>> The question is what "fix it" means. I'm really concerned about AC escapes,
>>> and everyone else should be, too.
>>
>> I do think that it might be the right thing to do to add some kind of
>> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
>>
>> We'd just have to abstract it sanely. I'm sure arm64 has the exact
>> same issue with PAN - maybe it saves properly, but the same "we
>> wouldn't want to go through the scheduler with PAN clear".
>>
>> On x86, we might as well check DF at the same time as AC.
>>
> 
> hpa is right, though — calling into tracing code with AC set is not really so good.  And calling schedule() (via preempt_enable() or whatever) is also bad because it runs all the scheduler code with AC on.  Admittedly, the scheduler is not *that* interesting of an attack surface.
> 

Not just that, but the other question is just how much code we are running
with AC open. It really should only be done in some very small regions.

	-hpa
Julien Thierry Feb. 19, 2019, 8:53 a.m. UTC | #15
On 19/02/2019 00:24, Linus Torvalds wrote:
> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <hpa@zytor.com> wrote:
>>
>> The question is what "fix it" means. I'm really concerned about AC escapes,
>> and everyone else should be, too.
> 
> I do think that it might be the right thing to do to add some kind of
> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
> 
> We'd just have to abstract it sanely. I'm sure arm64 has the exact
> same issue with PAN - maybe it saves properly, but the same "we
> wouldn't want to go through the scheduler with PAN clear".
> 

As of right now, we have the same issue on arm64 as on x86. We don't
currently save the PAN bit on task switch, but I have a patch to do that.

Unless we decide to go down the route of warning against uses of
schedule() inside user_access regions.

As for the abstraction, I had this patch[1] that added another primitive
for the user_access API (although this might not be suited for x86 if
you also want to check DF). However, an issue that appears is where to
perform the check to cover enough ground.

Checking inside the schedule() code, you only cover cases where things
have already gone wrong, and not the use of functions that are unsafe to
call inside a user_access region.

[1]
http://lists.infradead.org/pipermail/linux-arm-kernel/2019-January/625385.html

Cheers,
Peter Zijlstra Feb. 19, 2019, 9:04 a.m. UTC | #16
On Mon, Feb 18, 2019 at 02:30:21PM -0800, H. Peter Anvin wrote:
> On 2/16/19 2:30 AM, Peter Zijlstra wrote:
> > On Fri, Feb 15, 2019 at 08:06:56PM -0800, hpa@zytor.com wrote:
> >> This implies we invoke schedule -- a restricted operation (consider
> >> might_sleep) during execution of STAC-enabled code, but *not* as an
> >> exception or interrupt, since those preserve the flags.
> > 
> > Meet preempt_enable().
> 
> I believe this falls under "doctor, it hurts when I do that." And it hurts for
> very good reasons. See below.

I disagree; the basic rule is that if you're preemptible you must also
be able to schedule and vice-versa. These AC regions violate this.

And, like illustrated, it is _very_ easy to get all sorts inside that AC
crap.

> >> I have serious concerns about this. This is more or less saying that
> >> we have left an unlimited gap and have had AC escape.
> > 
> > Yes; by allowing normal C in between STAC and CLAC this is so.
> > 
> >> Does *anyone* see a need to allow this? I got a question at LPC from
> >> someone about this, and what they were trying to do once all the
> >> layers had been unwound was so far down the wrong track for a root
> >> problem that actually has a very simple solution.
> > 
> > Have you read the rest of the thread?
> > 
> > All it takes for this to explode is a call to a C function that has
> > tracing on it in between the user_access_begin() and user_access_end()
> > things. That is a _very_ easy thing to do.
> > 
> > Heck, GCC is allowed to generate that broken code compiling
> > lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
> > with CONFIG_OPTIMIZE_INLINING), and making that a function call would
> > get us fentry hooks and ftrace and *BOOM*.
> > 
> > (Now, of course, it's a static function with a single caller, and GCC
> > isn't _THAT_ stupid, but it could, if it wanted to)
> > 
> > Since the bar is _that_ low for mistakes, I figure we'd better fix it.
> > 
> 
> The question is what "fix it" means. 

It means that when we do schedule, the next task doesn't have AC set,
and when we schedule back, we'll restore our AC when we had it. Unlike
the current situation, where the next task will run with AC set.

IOW, it confines AC to the task context where it was set.

> I'm really concerned about AC escapes,
> and everyone else should be, too.

Then _that_ should be asserted.

> For an issue specific to tracing, it would be more appropriate to do the
> appropriate things in the tracing entry/exit than in schedule. Otherwise, we
> don't want to silently paper over mistakes which could mean that we run a
> large portion of the kernel without protection we thought we had.
> 
> In that sense, calling schedule() with AC set is in fact a great place to have
> a WARN() or even BUG(), because such an event really could imply that the
> kernel has been security compromised.

It is not specific to tracing; tracing is just one of the most trivial
and non-obvious ways to make it go splat.

There's a lot of fairly innocent stuff that does preempt_disable() /
preempt_enable(). And just a warning in schedule() isn't sufficient,
you'd have to actually trigger a reschedule before you know your code is
bad.

And like I said; the invariant is: if you're preemptible you can
schedule and v.v.

Now, clearly you don't want to mark these whole regions as !preemptible,
because then you can also not take faults, but somehow you're not
worried about the whole fault handler, but you are worried about the
scheduler ?!? How does that work? The fault handler can touch a _ton_
more code. Heck, the fault handler can schedule.

So either pre-fault, and run the whole AC crap with preemption disabled
and retry, or accept that we have to schedule.

> Does that make more sense?

It appears to me you're going about it backwards.
Julien Thierry Feb. 19, 2019, 9:07 a.m. UTC | #17
On 19/02/2019 02:46, H. Peter Anvin wrote:
> On 2/18/19 6:20 PM, Andy Lutomirski wrote:
>>
>>
>>> On Feb 18, 2019, at 4:24 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>>
>>>> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <hpa@zytor.com> wrote:
>>>>
>>>> The question is what "fix it" means. I'm really concerned about AC escapes,
>>>> and everyone else should be, too.
>>>
>>> I do think that it might be the right thing to do to add some kind of
>>> WARN_ON_ONCE() for AC being set in various can-reschedule situations.
>>>
>>> We'd just have to abstract it sanely. I'm sure arm64 has the exact
>>> same issue with PAN - maybe it saves properly, but the same "we
>>> wouldn't want to go through the scheduler with PAN clear".
>>>
>>> On x86, we might as well check DF at the same time as AC.
>>>
>>
>> hpa is right, though — calling into tracing code with AC set is not really so good.  And calling schedule() (via preempt_enable() or whatever) is also bad because it runs all the scheduler code with AC on.  Admittedly, the scheduler is not *that* interesting of an attack surface.
>>
> 
> Not just that, but the other question is just how much code we are running
> with AC open. It really should only be done in some very small regions.

Yes, but we don't really have a way to enforce that, as far as I'm aware.

The user_access_begin/end() API is generic, meaning any arch is free to
implement it. If they don't have the same hardware behaviour as
x86/arm64, it might be that their interrupt/exception entry code will
run with user_access open until they reach the entry code that closes it
(and entry code could potentially be a more interesting attack surface
than the scheduler). This could be the case for software-emulated PAN on
arm/arm64 (although arm, non-64bit, doesn't have user_access_begin/end()
at the moment).

So the whole "very small region" restriction sounds a bit
loose/arbitrary to me...

Thanks,
Peter Zijlstra Feb. 19, 2019, 9:15 a.m. UTC | #18
On Mon, Feb 18, 2019 at 04:24:30PM -0800, Linus Torvalds wrote:
> On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <hpa@zytor.com> wrote:
> >
> > The question is what "fix it" means. I'm really concerned about AC escapes,
> > and everyone else should be, too.
> 
> I do think that it might be the right thing to do to add some kind of
> WARN_ON_ONCE() for AC being set in various can-reschedule situations.

So I disagree.

Either we set AC with preempt disabled, and then we don't need an extra
warning, because we already have a warning about scheduling with
preemption disabled, or we accept that the fault handler can run.
Peter Zijlstra Feb. 19, 2019, 9:19 a.m. UTC | #19
On Tue, Feb 19, 2019 at 10:15:25AM +0100, Peter Zijlstra wrote:
> On Mon, Feb 18, 2019 at 04:24:30PM -0800, Linus Torvalds wrote:
> > On Mon, Feb 18, 2019 at 2:31 PM H. Peter Anvin <hpa@zytor.com> wrote:
> > >
> > > The question is what "fix it" means. I'm really concerned about AC escapes,
> > > and everyone else should be, too.
> > 
> > I do think that it might be the right thing to do to add some kind of
> > WARN_ON_ONCE() for AC being set in various can-reschedule situations.
> 
> So I disagree.
> 
> Either we set AC with preempt disabled, and then we don't need an extra
> warning, because we already have a warning about scheduling with
> preemption disabled, or we accept that the fault handler can run.

n/m about the faults, forgot the obvious :/

I still really dislike wrecking the preemption model over this.
H. Peter Anvin Feb. 19, 2019, 9:21 a.m. UTC | #20
On February 19, 2019 1:04:09 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Mon, Feb 18, 2019 at 02:30:21PM -0800, H. Peter Anvin wrote:
>> On 2/16/19 2:30 AM, Peter Zijlstra wrote:
>> > On Fri, Feb 15, 2019 at 08:06:56PM -0800, hpa@zytor.com wrote:
>> >> This implies we invoke schedule -- a restricted operation (consider
>> >> might_sleep) during execution of STAC-enabled code, but *not* as an
>> >> exception or interrupt, since those preserve the flags.
>> > 
>> > Meet preempt_enable().
>> 
>> I believe this falls under "doctor, it hurts when I do that." And it hurts
>> for very good reasons. See below.
>
>I disagree; the basic rule is that if you're preemptible you must also
>be able to schedule and vice-versa. These AC regions violate this.
>
>And, like illustrated, it is _very_ easy to get all sorts inside that AC
>crap.
>
>> >> I have serious concerns about this. This is more or less saying that
>> >> we have left an unlimited gap and have had AC escape.
>> > 
>> > Yes; by allowing normal C in between STAC and CLAC this is so.
>> > 
>> >> Does *anyone* see a need to allow this? I got a question at LPC from
>> >> someone about this, and what they were trying to do once all the
>> >> layers had been unwound was so far down the wrong track for a root
>> >> problem that actually has a very simple solution.
>> > 
>> > Have you read the rest of the thread?
>> > 
>> > All it takes for this to explode is a call to a C function that has
>> > tracing on it in between the user_access_begin() and user_access_end()
>> > things. That is a _very_ easy thing to do.
>> > 
>> > Heck, GCC is allowed to generate that broken code compiling
>> > lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
>> > with CONFIG_OPTIMIZE_INLINING), and making that a function call would
>> > get us fentry hooks and ftrace and *BOOM*.
>> > 
>> > (Now, of course, it's a static function with a single caller, and GCC
>> > isn't _THAT_ stupid, but it could, if it wanted to)
>> > 
>> > Since the bar is _that_ low for mistakes, I figure we'd better fix it.
>> > 
>> 
>> The question is what "fix it" means. 
>
>It means that when we do schedule, the next task doesn't have AC set,
>and when we schedule back, we'll restore our AC when we had it. Unlike
>the current situation, where the next task will run with AC set.
>
>IOW, it confines AC to the task context where it was set.
>
>> I'm really concerned about AC escapes,
>> and everyone else should be, too.
>
>Then _that_ should be asserted.
>
>> For an issue specific to tracing, it would be more appropriate to do the
>> appropriate things in the tracing entry/exit than in schedule. Otherwise, we
>> don't want to silently paper over mistakes which could mean that we run a
>> large portion of the kernel without protection we thought we had.
>> 
>> In that sense, calling schedule() with AC set is in fact a great place to
>> have a WARN() or even BUG(), because such an event really could imply that
>> the kernel has been security compromised.
>
>It is not specific to tracing; tracing is just one of the most trivial
>and non-obvious ways to make it go splat.
>
>There's a lot of fairly innocent stuff that does preempt_disable() /
>preempt_enable(). And just a warning in schedule() isn't sufficient,
>you'd have to actually trigger a reschedule before you know your code is
>bad.
>
>And like I said; the invariant is: if you're preemptible you can
>schedule and v.v.
>
>Now, clearly you don't want to mark these whole regions as !preemptible,
>because then you can also not take faults, but somehow you're not
>worried about the whole fault handler, but you are worried about the
>scheduler ?!? How does that work? The fault handler can touch a _ton_
>more code. Heck, the fault handler can schedule.
>
>So either pre-fault, and run the whole AC crap with preemption disabled
>and retry, or accept that we have to schedule.
>
>> Does that make more sense?
>
>It appears to me you're going about it backwards.

I'm not worried about the fault handler, because AC is always preserved/disabled on exception entry; otherwise I would of course be very concerned.

To be clear: I'm not worried about the scheduler itself. I'm worried about how much code we have gone through to get there. Perhaps the scheduler itself is not the right point to probe for it, but we do need to catch things that have gone wrong, rather than just leaving the door wide open.

I would personally be far more comfortable saying you have to disable preemption in user access regions. It may be an unnecessary constraint for x86 and ARM64, but it is safe.

And Julien, yes, it is "somewhat arbitrary", but so are many engineering tradeoffs. Not everything has a very sharply definable line.
Peter Zijlstra Feb. 19, 2019, 9:44 a.m. UTC | #21
On Tue, Feb 19, 2019 at 10:04:09AM +0100, Peter Zijlstra wrote:
> > Does that make more sense?
> 
> It appears to me you're going about it backwards.

So how about you do a GCC plugin that verifies limits on code-gen
between user_access_begin/user_access_end() ?

 - No CALL/RET
   - implies user_access_end() happens
   - implies no fentry hooks
 - No __preempt_count frobbing
 - No tracepoints
 - ...

That way you put the burden on the special code, not on the rest of the
kernel.
Thomas Gleixner Feb. 19, 2019, 11:38 a.m. UTC | #22
On Tue, 19 Feb 2019, Peter Zijlstra wrote:

> On Tue, Feb 19, 2019 at 10:04:09AM +0100, Peter Zijlstra wrote:
> > > Does that make more sense?
> > 
> > It appears to me you're going about it backwards.
> 
> So how about you do a GCC plugin that verifies limits on code-gen
> between user_access_begin/user_access_end() ?
> 
>  - No CALL/RET
>    - implies user_access_end() happens
>    - implies no fentry hooks
>  - No __preempt_count frobbing
>  - No tracepoints
>  - ...
> 
> That way you put the burden on the special code, not on the rest of the
> kernel.

And then you have kprobes ....
Peter Zijlstra Feb. 19, 2019, 11:58 a.m. UTC | #23
On Tue, Feb 19, 2019 at 12:38:42PM +0100, Thomas Gleixner wrote:
> On Tue, 19 Feb 2019, Peter Zijlstra wrote:
> 
> > On Tue, Feb 19, 2019 at 10:04:09AM +0100, Peter Zijlstra wrote:
> > > > Does that make more sense?
> > > 
> > > It appears to me you're going about it backwards.
> > 
> > So how about you do a GCC plugin that verifies limits on code-gen
> > between user_access_begin/user_access_end() ?
> > 
> >  - No CALL/RET
> >    - implies user_access_end() happens
> >    - implies no fentry hooks
> >  - No __preempt_count frobbing
> >  - No tracepoints
> >  - ...
> > 
> > That way you put the burden on the special code, not on the rest of the
> > kernel.
> 
> And then you have kprobes ....

They prod the INT3 byte and then take an exception, and exceptions are
'fine'.
Will Deacon Feb. 19, 2019, 12:48 p.m. UTC | #24
On Tue, Feb 19, 2019 at 10:04:09AM +0100, Peter Zijlstra wrote:
> On Mon, Feb 18, 2019 at 02:30:21PM -0800, H. Peter Anvin wrote:
> > On 2/16/19 2:30 AM, Peter Zijlstra wrote:
> > > On Fri, Feb 15, 2019 at 08:06:56PM -0800, hpa@zytor.com wrote:
> > >> This implies we invoke schedule -- a restricted operation (consider
> > >> might_sleep) during execution of STAC-enabled code, but *not* as an
> > >> exception or interrupt, since those preserve the flags.
> > > 
> > > Meet preempt_enable().
> > 
> > I believe this falls under "doctor, it hurts when I do that." And it hurts for
> > very good reasons. See below.
> 
> I disagree; the basic rule is that if you're preemptible you must also
> be able to schedule and vice-versa. These AC regions violate this.
> 
> And, like illustrated, it is _very_ easy to get all sorts inside that AC
> crap.
> 
> > >> I have serious concerns about this. This is more or less saying that
> > >> we have left an unlimited gap and have had AC escape.
> > > 
> > > Yes; by allowing normal C in between STAC and CLAC this is so.
> > > 
> > >> Does *anyone* see a need to allow this? I got a question at LPC from
> > >> someone about this, and what they were trying to do once all the
> > >> layers had been unwound was so far down the wrong track for a root
> > >> problem that actually has a very simple solution.
> > > 
> > > Have you read the rest of the thread?
> > > 
> > > All it takes for this to explode is a call to a C function that has
> > > tracing on it in between the user_access_begin() and user_access_end()
> > > things. That is a _very_ easy thing to do.
> > > 
> > > Heck, GCC is allowed to generate that broken code compiling
> > > lib/strnlen_user.c; it doesn't _have_ to inline do_strnlen_user (esp.
> > > with CONFIG_OPTIMIZE_INLINING), and making that a function call would
> > > get us fentry hooks and ftrace and *BOOM*.
> > > 
> > > (Now, of course, it's a static function with a single caller, and GCC
> > > isn't _THAT_ stupid, but it could, if it wanted to)
> > > 
> > > Since the bar is _that_ low for mistakes, I figure we'd better fix it.
> > > 
> > 
> > The question is what "fix it" means. 
> 
> It means that when we do schedule, the next task doesn't have AC set,
> and when we schedule back, we'll restore our AC when we had it. Unlike
> the current situation, where the next task will run with AC set.
> 
> IOW, it confines AC to the task context where it was set.
> 
> > I'm really concerned about AC escapes,
> > and everyone else should be, too.
> 
> Then _that_ should be asserted.
> 
> > For an issue specific to tracing, it would be more appropriate to do the
> > appropriate things in the tracing entry/exit than in schedule. Otherwise, we
> > don't want to silently paper over mistakes which could mean that we run a
> > large portion of the kernel without protection we thought we had.
> > 
> > In that sense, calling schedule() with AC set is in fact a great place to have
> > a WARN() or even BUG(), because such an event really could imply that the
> > kernel has been security compromised.
> 
> It is not specific to tracing; tracing is just one of the most trivial
> and non-obvious ways to make it go splat.
> 
> There's a lot of fairly innocent stuff that does preempt_disable() /
> preempt_enable(). And just a warning in schedule() isn't sufficient,
> you'd have to actually trigger a reschedule before you know your code is
> bad.
> 
> And like I said; the invariant is: if you're preemptible you can
> schedule and v.v.
> 
> Now, clearly you don't want to mark these whole regions as !preemptible,
> because then you can also not take faults, but somehow you're not
> worried about the whole fault handler, but you are worried about the
> scheduler ?!? How does that work? The fault handler can touch a _ton_
> more code. Heck, the fault handler can schedule.
> 
> So either pre-fault, and run the whole AC crap with preemption disabled
> and retry, or accept that we have to schedule.

I think you'll still hate this, but could we not disable preemption during
the uaccess-enabled region, re-enabling it on the fault path after we've
toggled uaccess off and disable it again when we return back to the
uaccess-enabled region? Doesn't help with tracing, but it should at least
handle the common case.

Will
H. Peter Anvin Feb. 20, 2019, 10:55 p.m. UTC | #25
On 2/19/19 4:48 AM, Will Deacon wrote:
> 
> I think you'll still hate this, but could we not disable preemption during
> the uaccess-enabled region, re-enabling it on the fault path after we've
> toggled uaccess off and disable it again when we return back to the
> uaccess-enabled region? Doesn't help with tracing, but it should at least
> handle the common case.
> 

There is a worse problem with this, I now realize: this would mean blocking
preemption across what could possibly be a *very* large copy_from_user(), for
example.

Exceptions *have* to handle this; there is no way around it. Perhaps the
scheduler isn't the right place to put these kinds of asserts, either.

Now, __fentry__ is kind of a special beast; in some ways it is an "exception
implemented as a function call"; on x86 one could even consider using an INT
instruction in order to reduce the NOP footprint in the unarmed case.  Nor is
__fentry__ a C function; it has far more of an exception-like ABI.
*Regardless* of what else we do, I believe __fentry__ ought to
save/disable/restore AC, just like an exception does.

The idea of using s/a gcc plugin/objtool/ for this isn't really a bad idea.
Obviously the general problem is undecidable :) but the enforcement of some
simple, fairly draconian rules ("as tight as possible, but no tighter")
shouldn't be a huge problem.

An actual gcc plugin -- which would probably be quite complex -- could make
gcc itself aware of user space accesses and be able to rearrange them to
minimize STAC/CLAC and avoid kernel-space accesses inside those brackets.

Finally, of course, there is the option of simply outlawing this practice as a
matter of policy and requiring that all structures be accessed through a limited
set of APIs. As I recall, the number of places where there were
performance-critical regions which could not use the normal accessors are
fairly small (signal frames being the main one.) Doing bulk copy to/from
kernel memory and then accessing them from there would have some performance
cost, but would eliminate the need for this complexity entirely.

	-hpa
Julien Thierry Feb. 21, 2019, 12:06 p.m. UTC | #26
On 20/02/2019 22:55, H. Peter Anvin wrote:
> On 2/19/19 4:48 AM, Will Deacon wrote:
>>
>> I think you'll still hate this, but could we not disable preemption during
>> the uaccess-enabled region, re-enabling it on the fault path after we've
>> toggled uaccess off and disable it again when we return back to the
>> uaccess-enabled region? Doesn't help with tracing, but it should at least
>> handle the common case.
>>
> 
> There is a worse problem with this, I now realize: this would mean blocking
> preemption across what could possibly be a *very* large copy_from_user(), for
> example.
> 

Yes, that's a good point. And I guess a userspace program could forcibly
trigger the kernel into a large copy_from/to_user(), knowingly disabling
preemption.

I don't know how bad this could get.

> Exceptions *have* to handle this; there is no way around it. Perhaps the
> scheduler isn't the right place to put these kinds of asserts, either.
> 

Maybe I'm misunderstanding what you mean by "Exceptions *have* to
handle this". Isn't the fact that the AC/PAN flags get saved on
exception entry and set back to "user accesses disabled" already
handling it? Or are you referring to something else?

So far my understanding is that the exception/interrupt case is fine.
The worrying case is what gets called in the user access regions
(schedule(), preempt_enable(), tracing...).

Did I get lost somewhere?

> Now, __fentry__ is kind of a special beast; in some ways it is an "exception
> implemented as a function call"; on x86 one could even consider using an INT
> instruction in order to reduce the NOP footprint in the unarmed case.  Nor is
> __fentry__ a C function; it has far more of an exception-like ABI.
> *Regardless* of what else we do, I believe __fentry__ ought to
> save/disable/restore AC, just like an exception does.
> 

That does make sense to me. However it doesn't solve the issue of
calling (or preventing calls to) some function that rescheds.


> The idea of using s/a gcc plugin/objtool/ for this isn't really a bad idea.
> Obviously the general problem is undecidable :) but the enforcement of some
> simple, fairly draconian rules ("as tight as possible, but no tighter")
> shouldn't be a huge problem.
> 
> An actual gcc plugin -- which would probably be quite complex -- could make
> gcc itself aware of user space accesses and be able to rearrange them to
> minimize STAC/CLAC and avoid kernel-space accesses inside those brackets.
> 

Not sure I get this. The data that is retrieved from/stored to user space
is generally obtained from/saved into kernel-space addresses. And when you
start the user_access_begin() it means you plan on doing a bunch of user
accesses, so I wouldn't expect to be able to batch everything into registers.

Cheers,
Will Deacon Feb. 21, 2019, 12:46 p.m. UTC | #27
On Wed, Feb 20, 2019 at 02:55:59PM -0800, H. Peter Anvin wrote:
> On 2/19/19 4:48 AM, Will Deacon wrote:
> > 
> > I think you'll still hate this, but could we not disable preemption during
> > the uaccess-enabled region, re-enabling it on the fault path after we've
> > toggled uaccess off and disable it again when we return back to the
> > uaccess-enabled region? Doesn't help with tracing, but it should at least
> > handle the common case.
> > 
> 
> There is a worse problem with this, I now realize: this would mean blocking
> preemption across what could possibly be a *very* large copy_from_user(), for
> example.

I don't think it's legitimate to call copy_{to,from}_user() inside a
user_access_{begin,end} region. You'd need to add some unsafe variants,
which could periodically disable uaccess and call cond_resched() inside
the loop to avoid the problem you're alluding to.

For existing callers of copy_{to,from}_user(), there's no issue as they
don't call into the scheduler during the copy operation. Exceptions are
handled fine by the code in mainline today.

GCC plugins are a cool idea, but I'm just a bit nervous about relying on
them for things like this.

Will
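
A sketch of the chunked "unsafe" variant described above (hypothetical;
an unsafe_copy_to_user() primitive of this shape did not exist at the
time of this thread):

	static int copy_chunks_to_user(void __user *dst, const void *src, size_t len)
	{
		while (len) {
			size_t chunk = min_t(size_t, len, PAGE_SIZE);

			if (!user_access_begin(dst, chunk))
				return -EFAULT;
			unsafe_copy_to_user(dst, src, chunk, Efault);	/* hypothetical */
			user_access_end();	/* AC off before we might sleep */

			cond_resched();		/* safe: uaccess is closed here */

			dst += chunk;
			src += chunk;
			len -= chunk;
		}
		return 0;

	Efault:
		user_access_end();
		return -EFAULT;
	}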
Thomas Gleixner Feb. 21, 2019, 9:35 p.m. UTC | #28
On Thu, 21 Feb 2019, Julien Thierry wrote:
> On 20/02/2019 22:55, H. Peter Anvin wrote:
> > Now, __fentry__ is kind of a special beast; in some ways it is an "exception
> > implemented as a function call"; on x86 one could even consider using an INT
> > instruction in order to reduce the NOP footprint in the unarmed case.  Nor is
> > __fentry__ a C function; it has far more of an exception-like ABI.
> > *Regardless* of what else we do, I believe __fentry__ ought to
> > save/disable/restore AC, just like an exception does.
> > 
> 
> That does make sense to me. However it doesn't solve the issue of
> calling (or preventing calls to) some function that rescheds.

IMNSHO any call inside a AC region is a bug lurking round the corner. The
only thing which is tolerable is an exception of some sort.

Enforce that with objtool. End of story.

Thanks,

	tglx
Andy Lutomirski Feb. 21, 2019, 10:06 p.m. UTC | #29
> On Feb 21, 2019, at 4:46 AM, Will Deacon <will.deacon@arm.com> wrote:
> 
>> On Wed, Feb 20, 2019 at 02:55:59PM -0800, H. Peter Anvin wrote:
>>> On 2/19/19 4:48 AM, Will Deacon wrote:
>>> 
>>> I think you'll still hate this, but could we not disable preemption during
>>> the uaccess-enabled region, re-enabling it on the fault path after we've
>>> toggled uaccess off and disable it again when we return back to the
>>> uaccess-enabled region? Doesn't help with tracing, but it should at least
>>> handle the common case.
>>> 
>> 
>> There is a worse problem with this, I now realize: this would mean blocking
>> preemption across what could possibly be a *very* large copy_from_user(), for
>> example.
> 
> I don't think it's legitimate to call copy_{to,from}_user() inside a
> user_access_{begin,end} region. You'd need to add some unsafe variants,
> which could periodically disable uaccess and call cond_resched() inside
> the loop to avoid the problem you're eluding to.
> 

Definitely not safe. On x86 they do CLAC and everything breaks.
Linus Torvalds Feb. 21, 2019, 10:08 p.m. UTC | #30
On Thu, Feb 21, 2019 at 1:35 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> IMNSHO any call inside a AC region is a bug lurking round the corner. The
> only thing which is tolerable is an exception of some sort.
>
> Enforce that with objtool. End of story.

Not quite that simple.

There are some use cases where you really do want a call - albeit to
special functions - inside the AC region.

The prime example of this is likely "filldir()" in fs/readdir.c, which
is actually somewhat important for some loads (eg samba).

And the whole AC thing made it horribly horribly slow because readdir
is one of those things that doesn't just copy one value (or one
structure) to user space, but writes several different things.

Kind of like signal frame setup does.

You may not realize that right now signal frame setup *does* actually
do a call with AC set. See ia32_setup_rt_frame() that does

        put_user_try {
...
                compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp);
...
        } put_user_catch(err);

is a macro, but inside that macro is a call to sas_ss_reset().

Now, that is an inline function, so it hopefully doesn't actually
cause a call. But it's "only" an inline function, not "always_inline".
We did already have (and I rejected, for now) patches that made those
inlines not very strong...

[ Side note: currently signal frame setup uses the absolutely
*disgusting* put_user_ex() thing, but that's actually what
unsafe_put_user() was designed for. I just never got around to getting
rid of the nasty hot mess of put_user_ex() ]

Anyway, while signal frame setup just writes individual values,
"filldir()" does *both* several individual values _and_ does a
copy_to_user().

And it's that "copy_to_user()" that I want to eventually change to a
"unsafe_copy_to_user()", along with making the individual values be
written with unsafe_put_user().

Right now "filldir()" messes with AC a total of *six* times. It sets
four field values, and then does a "copy_to_user()", all of which
set/clear AC. It's wasting hundreds of cycles on this,
because AC is so slow to set/clear.

If AC was cheap, this wouldn't be much of an issue. But AC is really
really expensive. I think it's microcode and serializes the pipeline
or something.

Anyway. We already have a possible call site, and there are good
reasons for future ones too. But they are hopefully all very
controlled.

                 Linus
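
For context, the filldir() direction described above would collapse those
AC flips into one bracketed region, roughly like this sketch (names
loosely follow fs/readdir.c; unsafe_copy_to_user() is hypothetical here):

	if (!user_access_begin(dirent, reclen))
		goto efault;
	unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
	unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
	unsafe_put_user(d_off, &dirent->d_off, efault_end);
	unsafe_put_user(d_type, (char __user *)dirent + reclen - 1, efault_end);
	unsafe_copy_to_user(dirent->d_name, name, namlen, efault_end);	/* hypothetical */
	unsafe_put_user(0, dirent->d_name + namlen, efault_end);
	user_access_end();		/* a single STAC/CLAC pair */
	return 0;

efault_end:
	user_access_end();
efault:
	return -EFAULT;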
Thomas Gleixner Feb. 22, 2019, 6:10 p.m. UTC | #31
On Thu, 21 Feb 2019, Linus Torvalds wrote:
> On Thu, Feb 21, 2019 at 1:35 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > IMNSHO any call inside a AC region is a bug lurking round the corner. The
> > only thing which is tolerable is an exception of some sort.
> >
> > Enforce that with objtool. End of story.
> 
> Not quite that simple.

But correct :)

> Right now "filldir()" messes with AC a total of *six* times. It sets
> four field values, and then does a "copy_to_user()", all of which
> set/clear AC right now. It's wasting hundreds of cycles on this,
> because AC is so slow to set/clear.
> 
> If AC was cheap, this wouldn't be much of an issue. But AC is really
> really expensive. I think it's microcode and serializes the pipeline
> or something.
> 
> Anyway. We already have a possible call site, and there are good
> reasons for future ones too. But they are hopefully all very
> controlled.

I agree that a function which is doing the actual copy should be callable,
but random other functions? NO!

The problem is that once you open that can of worms the people who think
their problem is special will come around and do

      begin()
      copy_to_user_unsafe(uptr, collect_data())
      end()

just because they can. That's the stuff I'm worried about, not the
well-defined copy_to/from_user() invocation. We can deal with that and make sure
that it's safe even with tracing and annotate with some special magic. It's
simpler to find and check a new '__safe_inside_ac' annotation than chasing
randomly added code within a gazillion of begin/end sections.

Thanks,

	tglx
Linus Torvalds Feb. 22, 2019, 11:34 p.m. UTC | #32
On Fri, Feb 22, 2019 at 2:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> So find the below patch -- which spotted fail in ptrace.c
>
> It has an AC_SAFE(func) annotation which allows marking specific
> functions as safe to call. The patch includes 2 instances which were
> required to make arch/x86 'build':

Looks sane enough to me.

Can you make it do DF too while at it? I really think AC and DF are
the same in this context. If you call an arbitrary function with DF
set, things will very quickly go sideways (even if it might work in
practice as long as the function just doesn't do a memcpy or similar)

              Linus
H. Peter Anvin Feb. 22, 2019, 11:39 p.m. UTC | #33
On February 22, 2019 2:26:35 PM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Fri, Feb 22, 2019 at 07:10:34PM +0100, Thomas Gleixner wrote:
>
>> But correct :)
>
>> I agree, that a function which is doing the actual copy should be
>callable,
>> but random other functions? NO!
>
>So find the below patch -- which spotted fail in ptrace.c
>
>It has an AC_SAFE(func) annotation which allows marking specific
>functions as safe to call. The patch includes 2 instances which were
>required to make arch/x86 'build':
>
>arch/x86/ia32/ia32_signal.o: warning: objtool: ia32_restore_sigcontext()+0x3d: call to native_load_gs_index() with AC set
>arch/x86/kernel/ptrace.o: warning: objtool: genregs_get()+0x8e: call to getreg() with AC set
>
>It also screams (provided one has CONFIG_FUNCTION_TRACER=y) about the
>lack of notrace annotations on functions marked AC_SAFE():
>
>arch/x86/kernel/ptrace.o: warning: objtool: getreg()+0x0: call to __fentry__() with AC set
>
>It builds arch/x86 relatively clean; it only complains about some
>redundant CLACs in entry_64.S because it doesn't understand interrupts
>and I've not bothered creating an annotation for them yet.
>
>arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x4d: redundant CLAC
>arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x5a: redundant CLAC
>  ...
>arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC
>
>Also, I realized we don't need special annotations for preempt_count;
>preempt_disable() emits a CALL instruction which should readily trigger
>the warnings added here.
>
>The VDSO thing is a bit of a hack, but I couldn't quickly find anything
>better.
>
>Comments?
>
>---
> arch/x86/include/asm/special_insns.h |  2 ++
> arch/x86/kernel/ptrace.c             |  3 +-
> include/linux/frame.h                | 23 ++++++++++++++
> tools/objtool/arch.h                 |  4 ++-
> tools/objtool/arch/x86/decode.c      | 14 ++++++++-
>tools/objtool/check.c                | 59 ++++++++++++++++++++++++++++++++++++
> tools/objtool/check.h                |  3 +-
> tools/objtool/elf.h                  |  1 +
> 8 files changed, 105 insertions(+), 4 deletions(-)
>
>diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
>index 43c029cdc3fe..cd31e4433f4c 100644
>--- a/arch/x86/include/asm/special_insns.h
>+++ b/arch/x86/include/asm/special_insns.h
>@@ -5,6 +5,7 @@
> 
> #ifdef __KERNEL__
> 
>+#include <linux/frame.h>
> #include <asm/nops.h>
> 
> /*
>@@ -135,6 +136,7 @@ static inline void native_wbinvd(void)
> }
> 
> extern asmlinkage void native_load_gs_index(unsigned);
>+AC_SAFE(native_load_gs_index);
> 
> static inline unsigned long __read_cr4(void)
> {
>diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
>index 4b8ee05dd6ad..e278b4055a8b 100644
>--- a/arch/x86/kernel/ptrace.c
>+++ b/arch/x86/kernel/ptrace.c
>@@ -420,7 +420,7 @@ static int putreg(struct task_struct *child,
> 	return 0;
> }
> 
>-static unsigned long getreg(struct task_struct *task, unsigned long offset)
>+static notrace unsigned long getreg(struct task_struct *task, unsigned long offset)
> {
> 	switch (offset) {
> 	case offsetof(struct user_regs_struct, cs):
>@@ -444,6 +444,7 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
> 
> 	return *pt_regs_access(task_pt_regs(task), offset);
> }
>+AC_SAFE(getreg);
> 
> static int genregs_get(struct task_struct *target,
> 		       const struct user_regset *regset,
>diff --git a/include/linux/frame.h b/include/linux/frame.h
>index 02d3ca2d9598..5d354cf42a56 100644
>--- a/include/linux/frame.h
>+++ b/include/linux/frame.h
>@@ -21,4 +21,27 @@
> 
> #endif /* CONFIG_STACK_VALIDATION */
> 
>+#if defined(CONFIG_STACK_VALIDATION) && !defined(BUILD_VDSO) && !defined(BUILD_VDSO32)
>+/*
>+ * This macro marks functions as AC-safe, that is, it is safe to call from an
>+ * EFLAGS.AC enabled region (typically user_access_begin() /
>+ * user_access_end()).
>+ *
>+ * These functions in turn will only call AC-safe functions themselves (which
>+ * precludes tracing, including __fentry__ and scheduling, including
>+ * preempt_enable).
>+ *
>+ * AC-safe functions will obviously also not change EFLAGS.AC themselves.
>+ *
>+ * Since STAC/CLAC are CPL-0 only, this is all irrelevant for VDSO builds
>+ * (and the generated symbol reference will in fact cause link failures).
>+ */
>+#define AC_SAFE(func) \
>+	static void __used __section(.discard.ac_safe) \
>+		*__func_ac_safe_##func = func
>+
>+#else
>+#define AC_SAFE(func)
>+#endif
>+
> #endif /* _LINUX_FRAME_H */
>diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
>index b0d7dc3d71b5..48327099466d 100644
>--- a/tools/objtool/arch.h
>+++ b/tools/objtool/arch.h
>@@ -33,7 +33,9 @@
> #define INSN_STACK		8
> #define INSN_BUG		9
> #define INSN_NOP		10
>-#define INSN_OTHER		11
>+#define INSN_STAC		11
>+#define INSN_CLAC		12
>+#define INSN_OTHER		13
> #define INSN_LAST		INSN_OTHER
> 
> enum op_dest_type {
>diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
>index 540a209b78ab..d1e99d1460a5 100644
>--- a/tools/objtool/arch/x86/decode.c
>+++ b/tools/objtool/arch/x86/decode.c
>@@ -369,7 +369,19 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
> 
> 	case 0x0f:
> 
>-		if (op2 >= 0x80 && op2 <= 0x8f) {
>+		if (op2 == 0x01) {
>+
>+			if (modrm == 0xca) {
>+
>+				*type = INSN_CLAC;
>+
>+			} else if (modrm == 0xcb) {
>+
>+				*type = INSN_STAC;
>+
>+			}
>+
>+		} else if (op2 >= 0x80 && op2 <= 0x8f) {
> 
> 			*type = INSN_JUMP_CONDITIONAL;
> 
>diff --git a/tools/objtool/check.c b/tools/objtool/check.c
>index 0414a0d52262..01852602ca31 100644
>--- a/tools/objtool/check.c
>+++ b/tools/objtool/check.c
>@@ -127,6 +127,24 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func)
> 	return false;
> }
> 
>+static bool ac_safe_func(struct objtool_file *file, struct symbol *func)
>+{
>+	struct rela *rela;
>+
>+	/* check for AC_SAFE */
>+	if (file->ac_safe && file->ac_safe->rela)
>+		list_for_each_entry(rela, &file->ac_safe->rela->rela_list, list) {
>+			if (rela->sym->type == STT_SECTION &&
>+			    rela->sym->sec == func->sec &&
>+			    rela->addend == func->offset)
>+				return true;
>+			if (/* rela->sym->type == STT_FUNC && */ rela->sym == func)
>+				return true;
>+		}
>+
>+	return false;
>+}
>+
> /*
>  * This checks to see if the given function is a "noreturn" function.
>  *
>@@ -439,6 +457,8 @@ static void add_ignores(struct objtool_file *file)
> 
> 	for_each_sec(file, sec) {
> 		list_for_each_entry(func, &sec->symbol_list, list) {
>+			func->ac_safe = ac_safe_func(file, func);
>+
> 			if (func->type != STT_FUNC)
> 				continue;
> 
>@@ -1902,6 +1922,11 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
> 		switch (insn->type) {
> 
> 		case INSN_RETURN:
>+			if (state.ac) {
>+				WARN_FUNC("return with AC set", sec, insn->offset);
>+				return 1;
>+			}
>+
> 			if (func && has_modified_stack_frame(&state)) {
> 				WARN_FUNC("return with modified stack frame",
> 					  sec, insn->offset);
>@@ -1917,6 +1942,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
> 			return 0;
> 
> 		case INSN_CALL:
>+			if ((state.ac_safe || state.ac) && !insn->call_dest->ac_safe) {
>+				WARN_FUNC("call to %s() with AC set", sec, insn->offset,
>+						insn->call_dest->name);
>+				return 1;
>+			}
>+
> 			if (is_fentry_call(insn))
> 				break;
> 
>@@ -1928,6 +1959,11 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
> 
> 			/* fallthrough */
> 		case INSN_CALL_DYNAMIC:
>+			if ((state.ac_safe || state.ac) && !insn->call_dest->ac_safe) {
>+				WARN_FUNC("call to %s() with AC set", sec, insn->offset,
>+						insn->call_dest->name);
>+				return 1;
>+			}
> 			if (!no_fp && func && !has_valid_stack_frame(&state)) {
> 				WARN_FUNC("call without frame pointer save/setup",
> 					  sec, insn->offset);
>@@ -1980,6 +2016,26 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
> 
> 			break;
> 
>+		case INSN_STAC:
>+			if (state.ac_safe || state.ac) {
>+				WARN_FUNC("recursive STAC", sec, insn->offset);
>+				return 1;
>+			}
>+			state.ac = true;
>+			break;
>+
>+		case INSN_CLAC:
>+			if (!state.ac) {
>+				WARN_FUNC("redundant CLAC", sec, insn->offset);
>+				return 1;
>+			}
>+			if (state.ac_safe) {
>+				WARN_FUNC("AC-safe clears AC", sec, insn->offset);
>+				return 1;
>+			}
>+			state.ac = false;
>+			break;
>+
> 		default:
> 			break;
> 		}
>@@ -2141,6 +2197,8 @@ static int validate_functions(struct objtool_file *file)
> 			if (!insn || insn->ignore)
> 				continue;
> 
>+			state.ac_safe = func->ac_safe;
>+
> 			ret = validate_branch(file, insn, state);
> 			warnings += ret;
> 		}
>@@ -2198,6 +2256,7 @@ int check(const char *_objname, bool orc)
> 	INIT_LIST_HEAD(&file.insn_list);
> 	hash_init(file.insn_hash);
>	file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
>+	file.ac_safe = find_section_by_name(file.elf, ".discard.ac_safe");
> 	file.c_file = find_section_by_name(file.elf, ".comment");
> 	file.ignore_unreachables = no_unreachable;
> 	file.hints = false;
>diff --git a/tools/objtool/check.h b/tools/objtool/check.h
>index e6e8a655b556..c31ec3ca78f3 100644
>--- a/tools/objtool/check.h
>+++ b/tools/objtool/check.h
>@@ -31,7 +31,7 @@ struct insn_state {
> 	int stack_size;
> 	unsigned char type;
> 	bool bp_scratch;
>-	bool drap, end;
>+	bool drap, end, ac, ac_safe;
> 	int drap_reg, drap_offset;
> 	struct cfi_reg vals[CFI_NUM_REGS];
> };
>@@ -61,6 +61,7 @@ struct objtool_file {
> 	struct list_head insn_list;
> 	DECLARE_HASHTABLE(insn_hash, 16);
> 	struct section *whitelist;
>+	struct section *ac_safe;
> 	bool ignore_unreachables, c_file, hints, rodata;
> };
> 
>diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
>index bc97ed86b9cd..064c3df31e40 100644
>--- a/tools/objtool/elf.h
>+++ b/tools/objtool/elf.h
>@@ -62,6 +62,7 @@ struct symbol {
> 	unsigned long offset;
> 	unsigned int len;
> 	struct symbol *pfunc, *cfunc;
>+	bool ac_safe;
> };
> 
> struct rela {

I like it.

Objtool could also detect CLAC-STAC or STAC-CLAC sequences without memory operations and remove them; don't know how often that happens, but I know it *does* happen.
Andy Lutomirski Feb. 22, 2019, 11:55 p.m. UTC | #34
On Fri, Feb 22, 2019 at 2:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Feb 22, 2019 at 07:10:34PM +0100, Thomas Gleixner wrote:
>
> > But correct :)
>
> > I agree, that a function which is doing the actual copy should be callable,
> > but random other functions? NO!
>
> So find the below patch -- which spotted fail in ptrace.c
>
> It has an AC_SAFE(func) annotation which allows marking specific
> functions as safe to call. The patch includes 2 instances which were
> required to make arch/x86 'build':
>
>   arch/x86/ia32/ia32_signal.o: warning: objtool: ia32_restore_sigcontext()+0x3d: call to native_load_gs_index() with AC set
>   arch/x86/kernel/ptrace.o: warning: objtool: genregs_get()+0x8e: call to getreg() with AC set
>
> It also screams (provided one has CONFIG_FUNCTION_TRACER=y) about the
> lack of notrace annotations on functions marked AC_SAFE():
>
>   arch/x86/kernel/ptrace.o: warning: objtool: getreg()+0x0: call to __fentry__() with AC set
>
> It builds arch/x86 relatively clean; it only complains about some
> redundant CLACs in entry_64.S because it doesn't understand interrupts
> and I've not bothered creating an annotation for them yet.
>
>   arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x4d: redundant CLAC
>   arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x5a: redundant CLAC
>   ...
>   arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC

Does objtool understand altinstr?  If it understands that this is an
altinstr associated with a not-actually-a-fuction (i.e. END not
ENDPROC) piece of code, it could suppress this warning.

>
> -static unsigned long getreg(struct task_struct *task, unsigned long offset)
> +static notrace unsigned long getreg(struct task_struct *task, unsigned long offset)
>  {
>         switch (offset) {
>         case offsetof(struct user_regs_struct, cs):
> @@ -444,6 +444,7 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
>
>         return *pt_regs_access(task_pt_regs(task), offset);
>  }
> +AC_SAFE(getreg);

This takes the address and could plausibly suppress some optimizations.

>
> +#if defined(CONFIG_STACK_VALIDATION) && !defined(BUILD_VDSO) && !defined(BUILD_VDSO32)
> +/*
> + * This macro marks functions as AC-safe, that is, it is safe to call from an
> + * EFLAGS.AC enabled region (typically user_access_begin() /
> + * user_access_end()).
> + *
> + * These functions in turn will only call AC-safe functions themselves (which
> + * precludes tracing, including __fentry__ and scheduling, including
> + * preempt_enable).
> + *
> + * AC-safe functions will obviously also not change EFLAGS.AC themselves.
> + *
> + * Since STAC/CLAC are CPL-0 only, this is all irrelevant for VDSO builds
> + * (and the generated symbol reference will in fact cause link failures).
> + */
> +#define AC_SAFE(func) \
> +       static void __used __section(.discard.ac_safe) \
> +               *__func_ac_safe_##func = func

Are we okay with annotating function *declarations* in a way that
their addresses get taken whenever the declaration is processed?  It
would be nice if we could avoid this.

I'm wondering if we can just change the code that does getreg() and
load_gs_index() so it doesn't do it with AC set.  Also, what about
paravirt kernels?  They'll call into PV code for load_gs_index() with
AC set.

--Andy
Andy Lutomirski Feb. 23, 2019, 12:34 a.m. UTC | #35
[mailing lists removed because this is a potentially large source of exploits]

On Fri, Feb 22, 2019 at 2:26 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, Feb 22, 2019 at 07:10:34PM +0100, Thomas Gleixner wrote:
>
> > But correct :)
>
> > I agree, that a function which is doing the actual copy should be callable,
> > but random other functions? NO!
>
> So find the below patch -- which spotted fail in ptrace.c
>

Um, wait a moment.  You didn't find an oddity in ptrace.c.  You found
a giant freaking error in uaccess.h.

Am I missing something?  How are there not zillions of instances of
this that your patch ought to catch?  Or is genregs_get() really the
only example?
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 780f2b42c8ef..df0571a07b55 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -431,8 +431,10 @@ do {									\
 ({								\
 	__label__ __pu_label;					\
 	int __pu_err = -EFAULT;					\
+	__typeof__(*(ptr)) __pu_val;				\
+	__pu_val = x;						\
 	__uaccess_begin();					\
-	__put_user_size((x), (ptr), (size), __pu_label);	\
+	__put_user_size(__pu_val, (ptr), (size), __pu_label);	\
 	__pu_err = 0;						\
 __pu_label:							\
 	__uaccess_end();					\
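
For context, a hedged illustration of what the fix changes (do_something()
is a stand-in, not a function from this thread). Previously the macro
argument was evaluated inside the user-access window, so any function call
buried in it ran with EFLAGS.AC set; that is how genregs_get() ended up
calling getreg() under AC in the objtool warning quoted earlier. With the
fix, the value is computed before the window opens:

	/* before: the argument expression runs between STAC and CLAC */
	__uaccess_begin();
	__put_user_size(do_something(), ptr, size, label);

	/* after: evaluate the value, then open the window */
	__pu_val = do_something();
	__uaccess_begin();
	__put_user_size(__pu_val, ptr, size, label);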
Linus Torvalds Feb. 23, 2019, 1:12 a.m. UTC | #36
On Fri, Feb 22, 2019 at 4:34 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> [mailing lists removed because this is a potentially large source of exploits]

I think you're overly worried.

AC doesn't protect against "large source of exploits". If it did, then
all CPUs before Broadwell would be insecure. They aren't. They'd
better not be, considering that there's a _lot_ of Xeon machines out
there based on older microarchitectures. I think some of them might
even be reasonably current (eg Xeon E7v3 isn't _that_ old, and is
Haswell-based, and doesn't have SMAP afaik).

SMAP is a great debugging and development aid, and makes sure that
developers - who hopefully run primarily on modern platforms - don't
write code that just accesses user space directly (because with SMAP,
it won't work).

And yes, SMAP can limit the effect of kernel bugs, and turn something
that would otherwise be a security issue into "just a bug".

But running with AC on isn't a security issue in itself - it just
makes SMAP slightly less powerful. The biggest issue of the whole "AC
doesn't get saved/restored" is actually the *reverse* case, where a
preemption event could then cause a process that had AC on to be
scheduled away, then AC would stay on for some time, but then we might
schedule back with AC _clear_, and now you'd have a non-working user
access, and a possible DoS attack as a result because you returned
EFAULT to a system call that was perfectly fine.

See? It's not so much "AC stays on" that is a "sky is falling" issue,
it's actually "AC also gets turned off randomly" that actually has
real and immediate effects. "AC on" is unfortunate and not great,
don't get me wrong, but it's definitely not the end of the world.
Particularly not for short sequences.

That said:

> Um, wait a moment.  You didn't find an oddity in ptrace.c.  You found
> a giant freaking error in uaccess.h.

I agree that your patch is good, and should be applied. Mind writing
up a changelog and committing it to -tip?

> Am I missing something?  How are there not zillions of instances of
> this that your patch ought to catch?  Or is genregs_get() really the
> only example?

I really do think that it's very unusual to do "get/put_user()" with
complicated value arguments. So while your patch is obviously the
right thing to do, I really don't think this is a huge worry, or all
_that_ surprising that this issue apparently found just a single case
of a function call.

With all that said, I didn't even react to this part of PeterZ's
patch, but it's a good call, and I think it's also a great validation
of the objtool approach to validating AC. So cheers for that!

                Linus
Andy Lutomirski Feb. 23, 2019, 1:16 a.m. UTC | #37
On Fri, Feb 22, 2019 at 5:12 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Fri, Feb 22, 2019 at 4:34 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > [mailing lists removed because this is a potentially large source of exploits]
>
> I think you're overly worried.
>
> AC doesn't protect against "large source of exploits".

What I meant was: a potentially large grab bag of ways to help turn a
bug into an exploit.

> That said:
>
> > Um, wait a moment.  You didn't find an oddity in ptrace.c.  You found
> > a giant freaking error in uaccess.h.
>
> I agree that your patch is good, and should be applied. Mind writing
> up a changelog and committing it to -tip?

I don't have commit access :)  But I shall email it out once I test it a bit.

--Andy
Linus Torvalds Feb. 23, 2019, 1:33 a.m. UTC | #38
On Fri, Feb 22, 2019 at 5:17 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> What I meant was: a potentially large grab bag of ways to help turn a
> bug into an exploit.

Well, apparently there's only one place, and that one looks pretty harmless.

Of course, I don't think PeterZ said what his config was. Maybe it was
a very minimal config and only found that one case because all the
truly scary cases were configured out ;)

                   Linus
Linus Torvalds Feb. 23, 2019, 1:40 a.m. UTC | #39
On Fri, Feb 22, 2019 at 5:17 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> I don't have commit access :)  But I shall email it out once I test it a bit.

Btw, looking at it, with that change every user of __put_user_size()
now passes in the properly type-cast value in 'x'.

Which means that you could remove the odd typecast in the 8-byte case:

        case 8:                                                         \
                __put_user_goto_u64((__typeof__(*ptr))(x), ptr, label); \

and make it match all the other cases:

        case 8:                                                         \
                __put_user_goto_u64(x, ptr, label);                     \

but that's just from looking at the patch, and maybe I'm missing something.

                  Linus
Peter Zijlstra Feb. 23, 2019, 8:37 a.m. UTC | #40
On Fri, Feb 22, 2019 at 03:55:25PM -0800, Andy Lutomirski wrote:
> On Fri, Feb 22, 2019 at 2:26 PM Peter Zijlstra <peterz@infradead.org> wrote:

> >   arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC
> 
> Does objtool understand altinstr?

Yep, otherwise it would've never found any of the STAC/CLAC instructions
to begin with; they're all alternatives. The emitted code is all NOPs.

> If it understands that this is an
> altinstr associated with a not-actually-a-fuction (i.e. END not
> ENDPROC) piece of code, it could suppress this warning.

Using readelf on entry_64.o the symbol type appears to be STT_NOTYPE for
these 'functions', so yes, I can try and ignore this warning for those.
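
A plausible shape for that suppression (hedged; insn->func is only set
for STT_FUNC symbols, and the later memop patch in this thread keys a
warning on the same test):

	case INSN_CLAC:
		if (!state.ac) {
			if (insn->func)	/* ignore STT_NOTYPE entry code */
				WARN_FUNC("redundant CLAC", sec, insn->offset);
			break;
		}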

> > +#define AC_SAFE(func) \
> > +       static void __used __section(.discard.ac_safe) \
> > +               *__func_ac_safe_##func = func
> 
> Are we okay with annotating function *declarations* in a way that
> their addresses get taken whenever the declaration is processed?  It
> would be nice if we could avoid this.
> 
> I'm wondering if we can just change the code that does getreg() and
> load_gs_index() so it doesn't do it with AC set.  Also, what about
> paravirt kernels?  They'll call into PV code for load_gs_index() with
> AC set.

Fixing that code would be fine; but we need that annotation regardless.
Read Linus' email a little further back: he wants things like
copy_{to,from}_user_unsafe().

Those really would need to be marked AC-safe; there's no inlining that.

That said, yes, these annotations _suck_. But it's what we had and it
works; I'm just copying it (from STACK_FRAME_NON_STANDARD).

That said; maybe we can do something like:

#define AC_SAFE(func)						\
	asm (".pushsection .discard.ac_safe_sym\n\t"		\
	     "999: .ascii \"" #func "\"\n\t"			\
	     ".popsection\n\t"					\
	     ".pushsection .discard.ac_safe\n\t"		\
	     _ASM_PTR " 999b\n\t"				\
	     ".popsection")

I just don't have a clue on how to read that in objtool; weak elf foo.
I'll see if I can make it work.
Peter Zijlstra Feb. 23, 2019, 8:39 a.m. UTC | #41
On Fri, Feb 22, 2019 at 03:39:48PM -0800, hpa@zytor.com wrote:
> Objtool could also detect CLAC-STAC or STAC-CLAC sequences without
> memory operations and remove them; don't know how often that happens,
> but I know it *does* happen.

Objtool doesn't know about memops; that'd be a lot of work. Also,
objtool doesn't actually rewrite the text, at best it could warn about
such occurrences.
Peter Zijlstra Feb. 23, 2019, 8:43 a.m. UTC | #42
On Fri, Feb 22, 2019 at 03:34:00PM -0800, Linus Torvalds wrote:
> Can you make it do DF too while at it?

Sure.
Julien Thierry Feb. 25, 2019, 8:33 a.m. UTC | #43
On 22/02/2019 22:26, Peter Zijlstra wrote:
> On Fri, Feb 22, 2019 at 07:10:34PM +0100, Thomas Gleixner wrote:
> 
>> But correct :)
> 
>> I agree, that a function which is doing the actual copy should be callable,
>> but random other functions? NO!
> 
> So find the below patch -- which spotted fail in ptrace.c
> 
> It has an AC_SAFE(func) annotation which allows marking specific
> functions as safe to call. The patch includes 2 instances which were
> required to make arch/x86 'build':
> 
>   arch/x86/ia32/ia32_signal.o: warning: objtool: ia32_restore_sigcontext()+0x3d: call to native_load_gs_index() with AC set
>   arch/x86/kernel/ptrace.o: warning: objtool: genregs_get()+0x8e: call to getreg() with AC set
> 
> It also screams (provided one has CONFIG_FUNCTION_TRACER=y) about the
> lack of notrace annotations on functions marked AC_SAFE():
> 
>   arch/x86/kernel/ptrace.o: warning: objtool: getreg()+0x0: call to __fentry__() with AC set
> 
> It builds arch/x86 relatively clean; it only complains about some
> redundant CLACs in entry_64.S because it doesn't understand interrupts
> and I've not bothered creating an annotation for them yet.
> 
>   arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x4d: redundant CLAC
>   arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0x5a: redundant CLAC
>   ...
>   arch/x86/entry/entry_64.o: warning: objtool: .altinstr_replacement+0xb1: redundant CLAC
> 
> Also, I realized we don't need special annotations for preempt_count;
> preempt_disable() emits a CALL instruction which should readily trigger
> the warnings added here.
> 
> The VDSO thing is a bit of a hack, but I couldn't quickly find anything
> better.
> 
> Comments?

I haven't looked at all the details. But could the annotation be called
UACCESS_SAFE() (and corresponding naming in the objtool checks)? Since
this is not an x86-only issue and the AC flag only exists for x86.

Cheers,

Julien

> 
> ---
>  arch/x86/include/asm/special_insns.h |  2 ++
>  arch/x86/kernel/ptrace.c             |  3 +-
>  include/linux/frame.h                | 23 ++++++++++++++
>  tools/objtool/arch.h                 |  4 ++-
>  tools/objtool/arch/x86/decode.c      | 14 ++++++++-
>  tools/objtool/check.c                | 59 ++++++++++++++++++++++++++++++++++++
>  tools/objtool/check.h                |  3 +-
>  tools/objtool/elf.h                  |  1 +
>  8 files changed, 105 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
> index 43c029cdc3fe..cd31e4433f4c 100644
> --- a/arch/x86/include/asm/special_insns.h
> +++ b/arch/x86/include/asm/special_insns.h
> @@ -5,6 +5,7 @@
>  
>  #ifdef __KERNEL__
>  
> +#include <linux/frame.h>
>  #include <asm/nops.h>
>  
>  /*
> @@ -135,6 +136,7 @@ static inline void native_wbinvd(void)
>  }
>  
>  extern asmlinkage void native_load_gs_index(unsigned);
> +AC_SAFE(native_load_gs_index);
>  
>  static inline unsigned long __read_cr4(void)
>  {
> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
> index 4b8ee05dd6ad..e278b4055a8b 100644
> --- a/arch/x86/kernel/ptrace.c
> +++ b/arch/x86/kernel/ptrace.c
> @@ -420,7 +420,7 @@ static int putreg(struct task_struct *child,
>  	return 0;
>  }
>  
> -static unsigned long getreg(struct task_struct *task, unsigned long offset)
> +static notrace unsigned long getreg(struct task_struct *task, unsigned long offset)
>  {
>  	switch (offset) {
>  	case offsetof(struct user_regs_struct, cs):
> @@ -444,6 +444,7 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset)
>  
>  	return *pt_regs_access(task_pt_regs(task), offset);
>  }
> +AC_SAFE(getreg);
>  
>  static int genregs_get(struct task_struct *target,
>  		       const struct user_regset *regset,
> diff --git a/include/linux/frame.h b/include/linux/frame.h
> index 02d3ca2d9598..5d354cf42a56 100644
> --- a/include/linux/frame.h
> +++ b/include/linux/frame.h
> @@ -21,4 +21,27 @@
>  
>  #endif /* CONFIG_STACK_VALIDATION */
>  
> +#if defined(CONFIG_STACK_VALIDATION) && !defined(BUILD_VDSO) && !defined(BUILD_VDSO32)
> +/*
> + * This macro marks functions as AC-safe, that is, it is safe to call from an
> + * EFLAGS.AC enabled region (typically user_access_begin() /
> + * user_access_end()).
> + *
> + * These functions in turn will only call AC-safe functions themselves (which
> + * precludes tracing, including __fentry__ and scheduling, including
> + * preempt_enable).
> + *
> + * AC-safe functions will obviously also not change EFLAGS.AC themselves.
> + *
> + * Since STAC/CLAC are CPL-0 only, this is all irrelevant for VDSO builds
> + * (and the generated symbol reference will in fact cause link failures).
> + */
> +#define AC_SAFE(func) \
> +	static void __used __section(.discard.ac_safe) \
> +		*__func_ac_safe_##func = func
> +
> +#else
> +#define AC_SAFE(func)
> +#endif
> +
>  #endif /* _LINUX_FRAME_H */
> diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
> index b0d7dc3d71b5..48327099466d 100644
> --- a/tools/objtool/arch.h
> +++ b/tools/objtool/arch.h
> @@ -33,7 +33,9 @@
>  #define INSN_STACK		8
>  #define INSN_BUG		9
>  #define INSN_NOP		10
> -#define INSN_OTHER		11
> +#define INSN_STAC		11
> +#define INSN_CLAC		12
> +#define INSN_OTHER		13
>  #define INSN_LAST		INSN_OTHER
>  
>  enum op_dest_type {
> diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
> index 540a209b78ab..d1e99d1460a5 100644
> --- a/tools/objtool/arch/x86/decode.c
> +++ b/tools/objtool/arch/x86/decode.c
> @@ -369,7 +369,19 @@ int arch_decode_instruction(struct elf *elf, struct section *sec,
>  
>  	case 0x0f:
>  
> -		if (op2 >= 0x80 && op2 <= 0x8f) {
> +		if (op2 == 0x01) {
> +
> +			if (modrm == 0xca) {
> +
> +				*type = INSN_CLAC;
> +
> +			} else if (modrm == 0xcb) {
> +
> +				*type = INSN_STAC;
> +
> +			}
> +
> +		} else if (op2 >= 0x80 && op2 <= 0x8f) {
>  
>  			*type = INSN_JUMP_CONDITIONAL;
>  
> diff --git a/tools/objtool/check.c b/tools/objtool/check.c
> index 0414a0d52262..01852602ca31 100644
> --- a/tools/objtool/check.c
> +++ b/tools/objtool/check.c
> @@ -127,6 +127,24 @@ static bool ignore_func(struct objtool_file *file, struct symbol *func)
>  	return false;
>  }
>  
> +static bool ac_safe_func(struct objtool_file *file, struct symbol *func)
> +{
> +	struct rela *rela;
> +
> +	/* check for AC_SAFE */
> +	if (file->ac_safe && file->ac_safe->rela)
> +		list_for_each_entry(rela, &file->ac_safe->rela->rela_list, list) {
> +			if (rela->sym->type == STT_SECTION &&
> +			    rela->sym->sec == func->sec &&
> +			    rela->addend == func->offset)
> +				return true;
> +			if (/* rela->sym->type == STT_FUNC && */ rela->sym == func)
> +				return true;
> +		}
> +
> +	return false;
> +}
> +
>  /*
>   * This checks to see if the given function is a "noreturn" function.
>   *
> @@ -439,6 +457,8 @@ static void add_ignores(struct objtool_file *file)
>  
>  	for_each_sec(file, sec) {
>  		list_for_each_entry(func, &sec->symbol_list, list) {
> +			func->ac_safe = ac_safe_func(file, func);
> +
>  			if (func->type != STT_FUNC)
>  				continue;
>  
> @@ -1902,6 +1922,11 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
>  		switch (insn->type) {
>  
>  		case INSN_RETURN:
> +			if (state.ac) {
> +				WARN_FUNC("return with AC set", sec, insn->offset);
> +				return 1;
> +			}
> +
>  			if (func && has_modified_stack_frame(&state)) {
>  				WARN_FUNC("return with modified stack frame",
>  					  sec, insn->offset);
> @@ -1917,6 +1942,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
>  			return 0;
>  
>  		case INSN_CALL:
> +			if ((state.ac_safe || state.ac) && !insn->call_dest->ac_safe) {
> +				WARN_FUNC("call to %s() with AC set", sec, insn->offset,
> +						insn->call_dest->name);
> +				return 1;
> +			}
> +
>  			if (is_fentry_call(insn))
>  				break;
>  
> @@ -1928,6 +1959,11 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
>  
>  			/* fallthrough */
>  		case INSN_CALL_DYNAMIC:
> +			if ((state.ac_safe || state.ac) && !insn->call_dest->ac_safe) {
> +				WARN_FUNC("call to %s() with AC set", sec, insn->offset,
> +						insn->call_dest->name);
> +				return 1;
> +			}
>  			if (!no_fp && func && !has_valid_stack_frame(&state)) {
>  				WARN_FUNC("call without frame pointer save/setup",
>  					  sec, insn->offset);
> @@ -1980,6 +2016,26 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
>  
>  			break;
>  
> +		case INSN_STAC:
> +			if (state.ac_safe || state.ac) {
> +				WARN_FUNC("recursive STAC", sec, insn->offset);
> +				return 1;
> +			}
> +			state.ac = true;
> +			break;
> +
> +		case INSN_CLAC:
> +			if (!state.ac) {
> +				WARN_FUNC("redundant CLAC", sec, insn->offset);
> +				return 1;
> +			}
> +			if (state.ac_safe) {
> +				WARN_FUNC("AC-safe clears AC", sec, insn->offset);
> +				return 1;
> +			}
> +			state.ac = false;
> +			break;
> +
>  		default:
>  			break;
>  		}
> @@ -2141,6 +2197,8 @@ static int validate_functions(struct objtool_file *file)
>  			if (!insn || insn->ignore)
>  				continue;
>  
> +			state.ac_safe = func->ac_safe;
> +
>  			ret = validate_branch(file, insn, state);
>  			warnings += ret;
>  		}
> @@ -2198,6 +2256,7 @@ int check(const char *_objname, bool orc)
>  	INIT_LIST_HEAD(&file.insn_list);
>  	hash_init(file.insn_hash);
>  	file.whitelist = find_section_by_name(file.elf, ".discard.func_stack_frame_non_standard");
> +	file.ac_safe = find_section_by_name(file.elf, ".discard.ac_safe");
>  	file.c_file = find_section_by_name(file.elf, ".comment");
>  	file.ignore_unreachables = no_unreachable;
>  	file.hints = false;
> diff --git a/tools/objtool/check.h b/tools/objtool/check.h
> index e6e8a655b556..c31ec3ca78f3 100644
> --- a/tools/objtool/check.h
> +++ b/tools/objtool/check.h
> @@ -31,7 +31,7 @@ struct insn_state {
>  	int stack_size;
>  	unsigned char type;
>  	bool bp_scratch;
> -	bool drap, end;
> +	bool drap, end, ac, ac_safe;
>  	int drap_reg, drap_offset;
>  	struct cfi_reg vals[CFI_NUM_REGS];
>  };
> @@ -61,6 +61,7 @@ struct objtool_file {
>  	struct list_head insn_list;
>  	DECLARE_HASHTABLE(insn_hash, 16);
>  	struct section *whitelist;
> +	struct section *ac_safe;
>  	bool ignore_unreachables, c_file, hints, rodata;
>  };
>  
> diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
> index bc97ed86b9cd..064c3df31e40 100644
> --- a/tools/objtool/elf.h
> +++ b/tools/objtool/elf.h
> @@ -62,6 +62,7 @@ struct symbol {
>  	unsigned long offset;
>  	unsigned int len;
>  	struct symbol *pfunc, *cfunc;
> +	bool ac_safe;
>  };
>  
>  struct rela {
>
H. Peter Anvin Feb. 25, 2019, 8:47 a.m. UTC | #44
On February 23, 2019 12:39:42 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Fri, Feb 22, 2019 at 03:39:48PM -0800, hpa@zytor.com wrote:
>> Objtool could also detect CLAC-STAC or STAC-CLAC sequences without
>> memory operations and remove them; don't know how often that happens,
>> but I know it *does* happen.
>
>Objtool doesn't know about memops; that'd be a lot of work. Also,
>objtool doesn't actually rewrite the text, at best it could warn about
>such occurrences.

It doesn't have to understand the contents of the memop, but it seems that the presence of a modrm with mode ≠ 3 should be plenty. It needs to know that much in order to know the length of instructions anyway. For extra credit, ignore LEA or hinting instructions.
H. Peter Anvin Feb. 25, 2019, 8:49 a.m. UTC | #45
On February 23, 2019 12:39:42 AM PST, Peter Zijlstra <peterz@infradead.org> wrote:
>On Fri, Feb 22, 2019 at 03:39:48PM -0800, hpa@zytor.com wrote:
>> Objtool could also detect CLAC-STAC or STAC-CLAC sequences without
>> memory operations and remove them; don't know how often that happens,
>> but I know it *does* happen.
>
>Objtool doesn't know about memops; that'd be a lot of work. Also,
>objtool doesn't actually rewrite the text, at best it could warn about
>such occurrences.

It doesn't even have to change the text per se, just nullify the altinstr.
Peter Zijlstra Feb. 25, 2019, 10:51 a.m. UTC | #46
On Fri, Feb 22, 2019 at 03:55:25PM -0800, Andy Lutomirski wrote:
> I'm wondering if we can just change the code that does getreg() and
> load_gs_index() so it doesn't do it with AC set.  Also, what about
> paravirt kernels?  They'll call into PV code for load_gs_index() with
> AC set.

Paravirt can go bugger off. There's no sane way to fix that.

Luckily the load_gs_index() thing is part of the paravirt me harder crap
and so nobody sane should ever hit that.

I don't fully understand that code at all; I also have no clue why GS
has paravirt bits on but the other segments do not. But it looks like we
want to do that RELOAD_SEG() crap under SMAP because of the GET_SEG() ->
get_user_ex() thing.

Anyway, I only see 3 options here:

 1) live with the paravirt me harder builds complaining
 2) exclude the AC validation from the paravirt me harder builds
 3) rewrite this code to no need that stupid call in the first place


2 seems like an exceptionally bad idea, 3 would need someone who
understands this, so for now I'll pick 1 :-)


*thought*... we could delay the actual set_user_seg() thing until after
the get_user_catch(), would that work?
Peter Zijlstra Feb. 25, 2019, 11:55 a.m. UTC | #47
On Mon, Feb 25, 2019 at 08:33:35AM +0000, Julien Thierry wrote:
> > It has an AC_SAFE(func) annotation which allows marking specific
> > functions as safe to call. The patch includes 2 instances which were
> > required to make arch/x86 'build':

> I haven't looked at all the details. But could the annotation be called
> UACCESS_SAFE() (and corresponding naming in the objtool checks)? Since
> this is not an x86-only issue and the AC flag only exists for x86.

Sure that works. Lemme sed the patches.
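
The renamed annotation would then presumably read (a mechanical sed of
the AC_SAFE() definition above; the .discard.uaccess_safe section name is
an assumption):

	#define UACCESS_SAFE(func) \
		static void __used __section(.discard.uaccess_safe) \
			*__func_uaccess_safe_##func = func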
Peter Zijlstra Feb. 25, 2019, 1:21 p.m. UTC | #48
On Mon, Feb 25, 2019 at 12:47:00AM -0800, hpa@zytor.com wrote:
> It doesn't have to understand the contents of the memop, but it seems
> that the presence of a modrm with mode ≠ 3 should be plenty. It needs
> to know that much in order to know the length of instructions anyway.
> For extra credit, ignore LEA or hinting instructions.

A little something like so then?

arch/x86/kernel/fpu/signal.o: warning: objtool: .altinstr_replacement+0x9c: UACCESS disable without MEMOPs: copy_fpstate_to_sigframe()

00000000023c  000200000002 R_X86_64_PC32     0000000000000000 .text + 604
000000000240  000700000002 R_X86_64_PC32     0000000000000000 .altinstr_replacement + 99
000000000249  000200000002 R_X86_64_PC32     0000000000000000 .text + 610
00000000024d  000700000002 R_X86_64_PC32     0000000000000000 .altinstr_replacement + 9c

.text

604:   90                      nop
605:   90                      nop
606:   90                      nop
607:   83 ce 03                or     $0x3,%esi
60a:   89 b3 00 02 00 00       mov    %esi,0x200(%rbx)
610:   90                      nop
611:   90                      nop
612:   90                      nop

.altinstr_replacement

99:   0f 01 cb                stac
9c:   0f 01 ca                clac

Which looks like the tool failed to recognise that MOV as a memop.



---
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -37,7 +37,8 @@
 #define INSN_CLAC		12
 #define INSN_STD		13
 #define INSN_CLD		14
-#define INSN_OTHER		15
+#define INSN_MEMOP		15
+#define INSN_OTHER		16
 #define INSN_LAST		INSN_OTHER
 
 enum op_dest_type {
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -123,6 +123,9 @@ int arch_decode_instruction(struct elf *
 		modrm_mod = X86_MODRM_MOD(modrm);
 		modrm_reg = X86_MODRM_REG(modrm);
 		modrm_rm = X86_MODRM_RM(modrm);
+
+		if (modrm_mod != 3)
+			*type = INSN_MEMOP;
 	}
 
 	if (insn.sib.nbytes)
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -2047,6 +2047,7 @@ static int validate_branch(struct objtoo
 				WARN_FUNC("recursive UACCESS enable", sec, insn->offset);
 
 			state.uaccess = true;
+			state.memop = false;
 			break;
 
 		case INSN_CLAC:
@@ -2058,6 +2059,9 @@ static int validate_branch(struct objtoo
 				break;
 			}
 
+			if (!state.memop && insn->func)
+				WARN_FUNC("UACCESS disable without MEMOPs: %s()", sec, insn->offset, insn->func->name);
+
 			state.uaccess = false;
 			break;
 
@@ -2075,6 +2079,10 @@ static int validate_branch(struct objtoo
 			state.df = false;
 			break;
 
+		case INSN_MEMOP:
+			state.memop = true;
+			break;
+
 		default:
 			break;
 		}
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -31,7 +31,7 @@ struct insn_state {
 	int stack_size;
 	unsigned char type;
 	bool bp_scratch;
-	bool drap, end, uaccess, uaccess_safe, df;
+	bool drap, end, uaccess, uaccess_safe, df, memop;
 	int drap_reg, drap_offset;
 	struct cfi_reg vals[CFI_NUM_REGS];
 };
Andy Lutomirski Feb. 25, 2019, 3:36 p.m. UTC | #49
> On Feb 25, 2019, at 3:53 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> 
>> On Mon, Feb 25, 2019 at 11:51:44AM +0100, Peter Zijlstra wrote:
>>> On Fri, Feb 22, 2019 at 03:55:25PM -0800, Andy Lutomirski wrote:
>>> I'm wondering if we can just change the code that does getreg() and
>>> load_gs_index() so it doesn't do it with AC set.  Also, what about
>>> paravirt kernels?  They'll call into PV code for load_gs_index() with
>>> AC set.
>> 
>> Paravirt can go bugger off. There's no sane way to fix that.
> 
>> I don't fully understand that code at all; I also have no clue why GS
>> has paravirt bits on but the other segments do not. 
> 
> *sigh* SWAPGS
> 
>> *thought*... we could delay the actual set_user_seg() thing until after
>> the get_user_catch(), would that work?
> 
> 
> How horrible / broken is this?
> 
> ---
> 
> diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
> index 321fe5f5d0e9..67c866943102 100644
> --- a/arch/x86/ia32/ia32_signal.c
> +++ b/arch/x86/ia32/ia32_signal.c
> @@ -60,17 +60,21 @@
>    regs->seg = GET_SEG(seg) | 3;            \
> } while (0)
> 
> -#define RELOAD_SEG(seg)        {        \
> -    unsigned int pre = GET_SEG(seg);    \
> -    unsigned int cur = get_user_seg(seg);    \
> -    pre |= 3;                \
> -    if (pre != cur)                \
> -        set_user_seg(seg, pre);        \
> +#define LOAD_SEG(seg)        {            \
> +    pre_##seg = 3 | GET_SEG(seg);            \
> +    cur_##seg = get_user_seg(seg);            \
> +}
> +
> +#define RELOAD_SEG(seg)        {            \
> +    if (pre_##seg != cur_##seg)            \
> +        set_user_seg(seg, pre_##seg);        \
> }
> 
> static int ia32_restore_sigcontext(struct pt_regs *regs,
>                   struct sigcontext_32 __user *sc)
> {
> +    u16 pre_gs, pre_fs, pre_ds, pre_es;
> +    u16 cur_gs, cur_fs, cur_ds, cur_es;
>    unsigned int tmpflags, err = 0;
>    void __user *buf;
>    u32 tmp;
> @@ -85,10 +89,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
>         * the handler, but does not clobber them at least in the
>         * normal case.
>         */
> -        RELOAD_SEG(gs);
> -        RELOAD_SEG(fs);
> -        RELOAD_SEG(ds);
> -        RELOAD_SEG(es);
> +        LOAD_SEG(gs);
> +        LOAD_SEG(fs);
> +        LOAD_SEG(ds);
> +        LOAD_SEG(es);
> 
>        COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
>        COPY(dx); COPY(cx); COPY(ip); COPY(ax);
> @@ -106,6 +110,11 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
>        buf = compat_ptr(tmp);
>    } get_user_catch(err);
> 
> +    RELOAD_SEG(gs);
> +    RELOAD_SEG(fs);
> +    RELOAD_SEG(ds);
> +    RELOAD_SEG(es);
> +
>    err |= fpu__restore_sig(buf, 1);
> 
>    force_iret();

I would call this pretty horrible. How about we do it without macros? :)

But yes, deferring the segment load until after the read seems fine to me. Frankly, we could also just copy_from_user the whole thing up front — this code is not really a serious fast path.
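
A rough sketch of that copy-up-front variant (untested; field and helper
names taken from the snippets above, error handling abbreviated):

	struct sigcontext_32 sc_local;

	if (copy_from_user(&sc_local, sc, sizeof(sc_local)))
		return -EFAULT;

	/* no user-access window needed from here on */
	set_user_seg(gs, sc_local.gs | 3);
	set_user_seg(fs, sc_local.fs | 3);
	...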
Peter Zijlstra March 1, 2019, 3:07 p.m. UTC | #50
On Mon, Feb 25, 2019 at 02:21:03PM +0100, Peter Zijlstra wrote:
> On Mon, Feb 25, 2019 at 12:47:00AM -0800, hpa@zytor.com wrote:
> > It doesn't have to understand the contents of the memop, but it seems
> > that the presence of a modrm with mode ≠ 3 should be plenty. It needs
> > to know that much in order to know the length of instructions anyway.
> > For extra credit, ignore LEA or hinting instructions.
> 
> A little something like so then?


$ ./objtool check --no-fp --backtrace ../../defconfig-build/arch/x86/lib/usercopy_64.o
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: .altinstr_replacement+0x3: UACCESS disable without MEMOPs: __clear_user()
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x3a: (alt)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x2e: (branch)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x18: (branch)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: .altinstr_replacement+0xffffffffffffffff: (branch)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x5: (alt)
../../defconfig-build/arch/x86/lib/usercopy_64.o: warning: objtool: __clear_user()+0x0: <=== (func)


0000000000000000 <__clear_user>:
   0:   e8 00 00 00 00          callq  5 <__clear_user+0x5>
   1: R_X86_64_PLT32       __fentry__-0x4
   5:   90                      nop
   6:   90                      nop
   7:   90                      nop
   8:   48 89 f0                mov    %rsi,%rax
   b:   48 c1 ee 03             shr    $0x3,%rsi
   f:   83 e0 07                and    $0x7,%eax
  12:   48 89 f1                mov    %rsi,%rcx
  15:   48 85 c9                test   %rcx,%rcx
  18:   74 0f                   je     29 <__clear_user+0x29>
  1a:   48 c7 07 00 00 00 00    movq   $0x0,(%rdi)
  21:   48 83 c7 08             add    $0x8,%rdi
  25:   ff c9                   dec    %ecx
  27:   75 f1                   jne    1a <__clear_user+0x1a>
  29:   48 89 c1                mov    %rax,%rcx
  2c:   85 c9                   test   %ecx,%ecx
  2e:   74 0a                   je     3a <__clear_user+0x3a>
  30:   c6 07 00                movb   $0x0,(%rdi)
  33:   48 ff c7                inc    %rdi
  36:   ff c9                   dec    %ecx
  38:   75 f6                   jne    30 <__clear_user+0x30>
  3a:   90                      nop
  3b:   90                      nop
  3c:   90                      nop
  3d:   48 89 c8                mov    %rcx,%rax
  40:   c3                      retq


Seems correct. Not sure you want to go fix that though. Let me know if
you want more output.

Patch

--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -650,6 +650,7 @@  ENTRY(__switch_to_asm)
 	pushl	%ebx
 	pushl	%edi
 	pushl	%esi
+	pushfl
 
 	/* switch stack */
 	movl	%esp, TASK_threadsp(%eax)
@@ -672,6 +673,7 @@  ENTRY(__switch_to_asm)
 #endif
 
 	/* restore callee-saved registers */
+	popfl
 	popl	%esi
 	popl	%edi
 	popl	%ebx
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -291,6 +291,7 @@  ENTRY(__switch_to_asm)
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+	pushfq
 
 	/* switch stack */
 	movq	%rsp, TASK_threadsp(%rdi)
@@ -313,6 +314,7 @@  ENTRY(__switch_to_asm)
 #endif
 
 	/* restore callee-saved registers */
+	popfq
 	popq	%r15
 	popq	%r14
 	popq	%r13
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -40,6 +40,7 @@  asmlinkage void ret_from_fork(void);
  * order of the fields must match the code in __switch_to_asm().
  */
 struct inactive_task_frame {
+	unsigned long flags;
 #ifdef CONFIG_X86_64
 	unsigned long r15;
 	unsigned long r14;