Message ID | f2fa29790fb19e6596e1cda0d441e0f45730b533.1339614945.git.otubo@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote: > I added a syscall struct using priority levels as described in the > libseccomp man page. The priority numbers are based to the frequency > they appear in a sample strace from a regular qemu guest run under > libvirt. > > Libseccomp generates linear BPF code to filter system calls, those rules > are read one after another. The priority system places the most common > rules first in order to reduce the overhead when processing them. > > Also, since this is just a first RFC, the whitelist is a little raw. We > might need your help to improve, test and fine tune the set of system > calls. > > v2: Fixed some style issues > Removed code from vl.c and created qemu-seccomp.[ch] > Now using ARRAY_SIZE macro > Added more syscalls without priority/frequency set yet > > Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> > --- > qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > qemu-seccomp.h | 9 +++++++ > vl.c | 7 ++++++ > 3 files changed, 89 insertions(+) > create mode 100644 qemu-seccomp.c > create mode 100644 qemu-seccomp.h > > diff --git a/qemu-seccomp.c b/qemu-seccomp.c > new file mode 100644 > index 0000000..048b7ba > --- /dev/null > +++ b/qemu-seccomp.c > @@ -0,0 +1,73 @@ Copyright and license info missing. > +#include <stdio.h> > +#include <seccomp.h> > +#include "qemu-seccomp.h" > + > +static struct QemuSeccompSyscall seccomp_whitelist[] = { 'const' > + { SCMP_SYS(timer_settime), 255 }, > + { SCMP_SYS(timer_gettime), 254 }, > + { SCMP_SYS(futex), 253 }, > + { SCMP_SYS(select), 252 }, > + { SCMP_SYS(recvfrom), 251 }, > + { SCMP_SYS(sendto), 250 }, > + { SCMP_SYS(read), 249 }, > + { SCMP_SYS(brk), 248 }, > + { SCMP_SYS(clone), 247 }, > + { SCMP_SYS(mmap), 247 }, > + { SCMP_SYS(mprotect), 246 }, > + { SCMP_SYS(ioctl), 245 }, > + { SCMP_SYS(recvmsg), 245 }, > + { SCMP_SYS(sendmsg), 245 }, > + { SCMP_SYS(accept), 245 }, > + { SCMP_SYS(connect), 245 }, > + { SCMP_SYS(bind), 245 }, It would be nice to avoid connect() and bind(). Perhaps seccomp init should be postponed to after all sockets have been created? > + { SCMP_SYS(listen), 245 }, > + { SCMP_SYS(ioctl), 245 }, > + { SCMP_SYS(eventfd), 245 }, > + { SCMP_SYS(rt_sigprocmask), 245 }, > + { SCMP_SYS(write), 244 }, > + { SCMP_SYS(fcntl), 243 }, > + { SCMP_SYS(tgkill), 242 }, > + { SCMP_SYS(rt_sigaction), 242 }, > + { SCMP_SYS(pipe2), 242 }, > + { SCMP_SYS(munmap), 242 }, > + { SCMP_SYS(mremap), 242 }, > + { SCMP_SYS(getsockname), 242 }, > + { SCMP_SYS(getpeername), 242 }, > + { SCMP_SYS(fdatasync), 242 }, > + { SCMP_SYS(close), 242 } > +}; > + > +#define seccomp_whitelist_count ARRAY_SIZE(seccomp_whitelist) I'm not sure the #define helps much. > + > +int seccomp_start(void) > +{ > + int rc = 0; > + unsigned int i = 0; > + > + rc = seccomp_init(SCMP_ACT_KILL); > + if (rc < 0) { > + goto seccomp_return; > + } > + > + for (i = 0; i < seccomp_whitelist_count; i++) { > + rc = seccomp_rule_add(SCMP_ACT_ALLOW, seccomp_whitelist[i].num, 0); > + if (rc < 0) { > + goto seccomp_return; > + } > + rc = seccomp_syscall_priority(seccomp_whitelist[i].num, > + seccomp_whitelist[i].priority); > + if (rc < 0) { > + goto seccomp_return; > + } > + } > + > + rc = seccomp_load(); > + > + seccomp_return: > + seccomp_release(); > + if (rc < 0) { > + fprintf(stderr, > + "ERROR: failed to configure the seccomp syscall filter in the kernel\n"); Should this be fatal? > + } > + return rc; Return value is not used. > +} > diff --git a/qemu-seccomp.h b/qemu-seccomp.h > new file mode 100644 > index 0000000..3bbdd87 > --- /dev/null > +++ b/qemu-seccomp.h > @@ -0,0 +1,9 @@ Usual header protection #ifndeffery missing. > +#include <seccomp.h> > +#include "osdep.h" > + > +struct QemuSeccompSyscall { > + int32_t num; > + uint8_t priority; > +}; This definition is not used elsewhere, so it should be internal to qemu-seccomp.c. > + > +int seccomp_start(void); > diff --git a/vl.c b/vl.c > index 204d85b..315afaf 100644 > --- a/vl.c > +++ b/vl.c > @@ -61,6 +61,9 @@ > > #include <linux/ppdev.h> > #include <linux/parport.h> > +#ifdef CONFIG_LIBSECCOMP > +#include "qemu-seccomp.h" > +#endif > #endif > #ifdef __sun__ > #include <sys/stat.h> > @@ -2296,6 +2299,10 @@ int main(int argc, char **argv, char **envp) > const char *trace_events = NULL; > const char *trace_file = NULL; > > +#ifdef CONFIG_LIBSECCOMP > + seccomp_start(); > +#endif > + > atexit(qemu_run_exit_notifiers); > error_set_progname(argv[0]); > > -- > 1.7.9.5 > >
On Wed, Jun 13, 2012 at 04:20:22PM -0300, Eduardo Otubo wrote: > I added a syscall struct using priority levels as described in the > libseccomp man page. The priority numbers are based to the frequency > they appear in a sample strace from a regular qemu guest run under > libvirt. > > Libseccomp generates linear BPF code to filter system calls, those rules > are read one after another. The priority system places the most common > rules first in order to reduce the overhead when processing them. > > Also, since this is just a first RFC, the whitelist is a little raw. We > might need your help to improve, test and fine tune the set of system > calls. > > v2: Fixed some style issues > Removed code from vl.c and created qemu-seccomp.[ch] > Now using ARRAY_SIZE macro > Added more syscalls without priority/frequency set yet > > Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> > --- > qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > qemu-seccomp.h | 9 +++++++ > vl.c | 7 ++++++ > 3 files changed, 89 insertions(+) > create mode 100644 qemu-seccomp.c > create mode 100644 qemu-seccomp.h > > diff --git a/qemu-seccomp.c b/qemu-seccomp.c > new file mode 100644 > index 0000000..048b7ba > --- /dev/null > +++ b/qemu-seccomp.c > @@ -0,0 +1,73 @@ > +#include <stdio.h> > +#include <seccomp.h> > +#include "qemu-seccomp.h" > + > +static struct QemuSeccompSyscall seccomp_whitelist[] = { > + { SCMP_SYS(timer_settime), 255 }, > + { SCMP_SYS(timer_gettime), 254 }, > + { SCMP_SYS(futex), 253 }, > + { SCMP_SYS(select), 252 }, > + { SCMP_SYS(recvfrom), 251 }, > + { SCMP_SYS(sendto), 250 }, > + { SCMP_SYS(read), 249 }, > + { SCMP_SYS(brk), 248 }, > + { SCMP_SYS(clone), 247 }, > + { SCMP_SYS(mmap), 247 }, > + { SCMP_SYS(mprotect), 246 }, > + { SCMP_SYS(ioctl), 245 }, > + { SCMP_SYS(recvmsg), 245 }, > + { SCMP_SYS(sendmsg), 245 }, > + { SCMP_SYS(accept), 245 }, > + { SCMP_SYS(connect), 245 }, > + { SCMP_SYS(bind), 245 }, > + { SCMP_SYS(listen), 245 }, > + { SCMP_SYS(ioctl), 245 }, > + { SCMP_SYS(eventfd), 245 }, > + { SCMP_SYS(rt_sigprocmask), 245 }, > + { SCMP_SYS(write), 244 }, > + { SCMP_SYS(fcntl), 243 }, > + { SCMP_SYS(tgkill), 242 }, > + { SCMP_SYS(rt_sigaction), 242 }, > + { SCMP_SYS(pipe2), 242 }, > + { SCMP_SYS(munmap), 242 }, > + { SCMP_SYS(mremap), 242 }, > + { SCMP_SYS(getsockname), 242 }, > + { SCMP_SYS(getpeername), 242 }, > + { SCMP_SYS(fdatasync), 242 }, > + { SCMP_SYS(close), 242 } execve(), so QEMU can run things like the ifup/down scripts, the samba daemon (sic), exec: migration protocol, etc, etc Daniel
On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote: > On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote: > > I added a syscall struct using priority levels as described in the > > libseccomp man page. The priority numbers are based to the frequency > > they appear in a sample strace from a regular qemu guest run under > > libvirt. > > > > Libseccomp generates linear BPF code to filter system calls, those rules > > are read one after another. The priority system places the most common > > rules first in order to reduce the overhead when processing them. > > > > Also, since this is just a first RFC, the whitelist is a little raw. We > > might need your help to improve, test and fine tune the set of system > > calls. > > > > v2: Fixed some style issues > > Removed code from vl.c and created qemu-seccomp.[ch] > > Now using ARRAY_SIZE macro > > Added more syscalls without priority/frequency set yet > > > > Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> > > --- > > qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > > qemu-seccomp.h | 9 +++++++ > > vl.c | 7 ++++++ > > 3 files changed, 89 insertions(+) > > create mode 100644 qemu-seccomp.c > > create mode 100644 qemu-seccomp.h > > > > diff --git a/qemu-seccomp.c b/qemu-seccomp.c > > new file mode 100644 > > index 0000000..048b7ba > > --- /dev/null > > +++ b/qemu-seccomp.c > > @@ -0,0 +1,73 @@ > > Copyright and license info missing. > > > +#include <stdio.h> > > +#include <seccomp.h> > > +#include "qemu-seccomp.h" > > + > > +static struct QemuSeccompSyscall seccomp_whitelist[] = { > > 'const' > > > + { SCMP_SYS(timer_settime), 255 }, > > + { SCMP_SYS(timer_gettime), 254 }, > > + { SCMP_SYS(futex), 253 }, > > + { SCMP_SYS(select), 252 }, > > + { SCMP_SYS(recvfrom), 251 }, > > + { SCMP_SYS(sendto), 250 }, > > + { SCMP_SYS(read), 249 }, > > + { SCMP_SYS(brk), 248 }, > > + { SCMP_SYS(clone), 247 }, > > + { SCMP_SYS(mmap), 247 }, > > + { SCMP_SYS(mprotect), 246 }, > > + { SCMP_SYS(ioctl), 245 }, > > + { SCMP_SYS(recvmsg), 245 }, > > + { SCMP_SYS(sendmsg), 245 }, > > + { SCMP_SYS(accept), 245 }, > > + { SCMP_SYS(connect), 245 }, > > + { SCMP_SYS(bind), 245 }, > > It would be nice to avoid connect() and bind(). Perhaps seccomp init > should be postponed to after all sockets have been created? If you want to migrate your guest, you need to be able to call connect() at an arbitrary point in the QEMU process' lifecycle. So you can't avoid allowing connect(). Similarly if you want to allow hotplug of NICs (and their backends) then you need to have both bind() + connect() available. Daniel
On Wed, Jun 13, 2012 at 04:20:22PM -0300, Eduardo Otubo wrote: > I added a syscall struct using priority levels as described in the > libseccomp man page. The priority numbers are based to the frequency > they appear in a sample strace from a regular qemu guest run under > libvirt. > > Libseccomp generates linear BPF code to filter system calls, those rules > are read one after another. The priority system places the most common > rules first in order to reduce the overhead when processing them. > > Also, since this is just a first RFC, the whitelist is a little raw. We > might need your help to improve, test and fine tune the set of system > calls. > > v2: Fixed some style issues > Removed code from vl.c and created qemu-seccomp.[ch] > Now using ARRAY_SIZE macro > Added more syscalls without priority/frequency set yet > > +static struct QemuSeccompSyscall seccomp_whitelist[] = { > + { SCMP_SYS(timer_settime), 255 }, > + { SCMP_SYS(timer_gettime), 254 }, > + { SCMP_SYS(futex), 253 }, > + { SCMP_SYS(select), 252 }, > + { SCMP_SYS(recvfrom), 251 }, > + { SCMP_SYS(sendto), 250 }, > + { SCMP_SYS(read), 249 }, > + { SCMP_SYS(brk), 248 }, > + { SCMP_SYS(clone), 247 }, > + { SCMP_SYS(mmap), 247 }, > + { SCMP_SYS(mprotect), 246 }, > + { SCMP_SYS(ioctl), 245 }, > + { SCMP_SYS(recvmsg), 245 }, > + { SCMP_SYS(sendmsg), 245 }, > + { SCMP_SYS(accept), 245 }, > + { SCMP_SYS(connect), 245 }, > + { SCMP_SYS(bind), 245 }, > + { SCMP_SYS(listen), 245 }, > + { SCMP_SYS(ioctl), 245 }, > + { SCMP_SYS(eventfd), 245 }, > + { SCMP_SYS(rt_sigprocmask), 245 }, > + { SCMP_SYS(write), 244 }, > + { SCMP_SYS(fcntl), 243 }, > + { SCMP_SYS(tgkill), 242 }, > + { SCMP_SYS(rt_sigaction), 242 }, > + { SCMP_SYS(pipe2), 242 }, > + { SCMP_SYS(munmap), 242 }, > + { SCMP_SYS(mremap), 242 }, > + { SCMP_SYS(getsockname), 242 }, > + { SCMP_SYS(getpeername), 242 }, > + { SCMP_SYS(fdatasync), 242 }, > + { SCMP_SYS(close), 242 } > +}; Hmm, this is still missing the 'open' syscall. How does QEMU work at all with these patches applied, without this syscall listed ? Daniel
On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com> wrote: > On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote: >> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote: >> > I added a syscall struct using priority levels as described in the >> > libseccomp man page. The priority numbers are based to the frequency >> > they appear in a sample strace from a regular qemu guest run under >> > libvirt. >> > >> > Libseccomp generates linear BPF code to filter system calls, those rules >> > are read one after another. The priority system places the most common >> > rules first in order to reduce the overhead when processing them. >> > >> > Also, since this is just a first RFC, the whitelist is a little raw. We >> > might need your help to improve, test and fine tune the set of system >> > calls. >> > >> > v2: Fixed some style issues >> > Removed code from vl.c and created qemu-seccomp.[ch] >> > Now using ARRAY_SIZE macro >> > Added more syscalls without priority/frequency set yet >> > >> > Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> >> > --- >> > qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> > qemu-seccomp.h | 9 +++++++ >> > vl.c | 7 ++++++ >> > 3 files changed, 89 insertions(+) >> > create mode 100644 qemu-seccomp.c >> > create mode 100644 qemu-seccomp.h >> > >> > diff --git a/qemu-seccomp.c b/qemu-seccomp.c >> > new file mode 100644 >> > index 0000000..048b7ba >> > --- /dev/null >> > +++ b/qemu-seccomp.c >> > @@ -0,0 +1,73 @@ >> >> Copyright and license info missing. >> >> > +#include <stdio.h> >> > +#include <seccomp.h> >> > +#include "qemu-seccomp.h" >> > + >> > +static struct QemuSeccompSyscall seccomp_whitelist[] = { >> >> 'const' >> >> > + { SCMP_SYS(timer_settime), 255 }, >> > + { SCMP_SYS(timer_gettime), 254 }, >> > + { SCMP_SYS(futex), 253 }, >> > + { SCMP_SYS(select), 252 }, >> > + { SCMP_SYS(recvfrom), 251 }, >> > + { SCMP_SYS(sendto), 250 }, >> > + { SCMP_SYS(read), 249 }, >> > + { SCMP_SYS(brk), 248 }, >> > + { SCMP_SYS(clone), 247 }, >> > + { SCMP_SYS(mmap), 247 }, >> > + { SCMP_SYS(mprotect), 246 }, >> > + { SCMP_SYS(ioctl), 245 }, >> > + { SCMP_SYS(recvmsg), 245 }, >> > + { SCMP_SYS(sendmsg), 245 }, >> > + { SCMP_SYS(accept), 245 }, >> > + { SCMP_SYS(connect), 245 }, >> > + { SCMP_SYS(bind), 245 }, >> >> It would be nice to avoid connect() and bind(). Perhaps seccomp init >> should be postponed to after all sockets have been created? > > If you want to migrate your guest, you need to be able to > call connect() at an arbitrary point in the QEMU process' > lifecycle. So you can't avoid allowing connect(). Similarly > if you want to allow hotplug of NICs (and their backends) > then you need to have both bind() + connect() available. That's bad. Migration could conceivably be extended to use file descriptor passing, but hotplug is more tricky. It's probably easier to split QEMU to two processes, one with greater rights like bind(), connect() etc. and another with tighter set. > > > Daniel > -- > |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| > |: http://libvirt.org -o- http://virt-manager.org :| > |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| > |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On Wed, Jun 13, 2012 at 8:30 PM, Daniel P. Berrange <berrange@redhat.com> wrote: > On Wed, Jun 13, 2012 at 04:20:22PM -0300, Eduardo Otubo wrote: >> I added a syscall struct using priority levels as described in the >> libseccomp man page. The priority numbers are based to the frequency >> they appear in a sample strace from a regular qemu guest run under >> libvirt. >> >> Libseccomp generates linear BPF code to filter system calls, those rules >> are read one after another. The priority system places the most common >> rules first in order to reduce the overhead when processing them. >> >> Also, since this is just a first RFC, the whitelist is a little raw. We >> might need your help to improve, test and fine tune the set of system >> calls. >> >> v2: Fixed some style issues >> Removed code from vl.c and created qemu-seccomp.[ch] >> Now using ARRAY_SIZE macro >> Added more syscalls without priority/frequency set yet >> >> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> >> --- >> qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> qemu-seccomp.h | 9 +++++++ >> vl.c | 7 ++++++ >> 3 files changed, 89 insertions(+) >> create mode 100644 qemu-seccomp.c >> create mode 100644 qemu-seccomp.h >> >> diff --git a/qemu-seccomp.c b/qemu-seccomp.c >> new file mode 100644 >> index 0000000..048b7ba >> --- /dev/null >> +++ b/qemu-seccomp.c >> @@ -0,0 +1,73 @@ >> +#include <stdio.h> >> +#include <seccomp.h> >> +#include "qemu-seccomp.h" >> + >> +static struct QemuSeccompSyscall seccomp_whitelist[] = { >> + { SCMP_SYS(timer_settime), 255 }, >> + { SCMP_SYS(timer_gettime), 254 }, >> + { SCMP_SYS(futex), 253 }, >> + { SCMP_SYS(select), 252 }, >> + { SCMP_SYS(recvfrom), 251 }, >> + { SCMP_SYS(sendto), 250 }, >> + { SCMP_SYS(read), 249 }, >> + { SCMP_SYS(brk), 248 }, >> + { SCMP_SYS(clone), 247 }, >> + { SCMP_SYS(mmap), 247 }, >> + { SCMP_SYS(mprotect), 246 }, >> + { SCMP_SYS(ioctl), 245 }, >> + { SCMP_SYS(recvmsg), 245 }, >> + { SCMP_SYS(sendmsg), 245 }, >> + { SCMP_SYS(accept), 245 }, >> + { SCMP_SYS(connect), 245 }, >> + { SCMP_SYS(bind), 245 }, >> + { SCMP_SYS(listen), 245 }, >> + { SCMP_SYS(ioctl), 245 }, >> + { SCMP_SYS(eventfd), 245 }, >> + { SCMP_SYS(rt_sigprocmask), 245 }, >> + { SCMP_SYS(write), 244 }, >> + { SCMP_SYS(fcntl), 243 }, >> + { SCMP_SYS(tgkill), 242 }, >> + { SCMP_SYS(rt_sigaction), 242 }, >> + { SCMP_SYS(pipe2), 242 }, >> + { SCMP_SYS(munmap), 242 }, >> + { SCMP_SYS(mremap), 242 }, >> + { SCMP_SYS(getsockname), 242 }, >> + { SCMP_SYS(getpeername), 242 }, >> + { SCMP_SYS(fdatasync), 242 }, >> + { SCMP_SYS(close), 242 } > > execve(), so QEMU can run things like the ifup/down > scripts, the samba daemon (sic), exec: migration protocol, > etc, etc I think allowing execve() would render seccomp pretty much useless. > > > Daniel > -- > |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| > |: http://libvirt.org -o- http://virt-manager.org :| > |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| > |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :| >
On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> I think allowing execve() would render seccomp pretty much useless.
Not necessarily.
I'll agree that it does seem a bit odd to allow execve(), but there is still
value in enabling seccomp to disable potentially buggy/exploitable syscalls.
Let's not forget that we have over 300 syscalls on x86_64, not including the
32 bit versions, and even if we add all of the new syscalls suggested in this
thread we are still talking about a small subset of syscalls. As far as
security goes, the old adage of "less is more" applies.
Protecting against the abuse and misuse of execve() is something that is
better done with the host's access controls (traditional DAC, MAC via the LSM,
etc.).
On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote: > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >> I think allowing execve() would render seccomp pretty much useless. > > Not necessarily. > > I'll agree that it does seem a bit odd to allow execve(), but there is still > value in enabling seccomp to disable potentially buggy/exploitable syscalls. > Let's not forget that we have over 300 syscalls on x86_64, not including the > 32 bit versions, and even if we add all of the new syscalls suggested in this > thread we are still talking about a small subset of syscalls. As far as > security goes, the old adage of "less is more" applies. The helper program being executed could need any of the 300 system calls, so we'd have to allow all. > > Protecting against the abuse and misuse of execve() is something that is > better done with the host's access controls (traditional DAC, MAC via the LSM, > etc.). How about seccomp mode selected by command line switch -seccomp, in which bind/connect/open/execve are forbidden? The functionality remaining would be somewhat limited (can't migrate or use SMB etc. until refactoring of QEMU), but that way seccomp jail would be much tighter. > > -- > paul moore > security and virtualization @ redhat >
On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote: > On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote: > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: > >> I think allowing execve() would render seccomp pretty much useless. > > > > Not necessarily. > > > > I'll agree that it does seem a bit odd to allow execve(), but there is > > still value in enabling seccomp to disable potentially buggy/exploitable > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not > > including the 32 bit versions, and even if we add all of the new syscalls > > suggested in this thread we are still talking about a small subset of > > syscalls. As far as security goes, the old adage of "less is more" > > applies. > > The helper program being executed could need any of the 300 system > calls, so we'd have to allow all. Don't we have some basic understanding of what the applications being exec'd will need to do? I sorta see your point, but allowing the entire set of syscalls seems a bit dramatic. > > Protecting against the abuse and misuse of execve() is something that is > > better done with the host's access controls (traditional DAC, MAC via the > > LSM, etc.). > > How about seccomp mode selected by command line switch -seccomp, in > which bind/connect/open/execve are forbidden? The functionality > remaining would be somewhat limited (can't migrate or use SMB etc. > until refactoring of QEMU), but that way seccomp jail would be much > tighter. When I spoke to Anthony about this earlier (offline, sorry) he was opposed to requiring any switches or user interaction to enable seccomp. I'm not sure if his stance on this has changed any over the past few months. In my perfect world, we would have a decomposed QEMU that functions as a series of processes connected via some sort of IPC; the exact divisions are a bit TBD and beyond the scope of this discussion. In this scenario we would be able to restrict QEMU with sVirt and seccomp to a much higher degree than we could with the current monolithic QEMU. I don't expect to see my perfect world any time soon, but in the meantime we can still improve the security of QEMU on Linux with these seccomp patches and for that reason I think it's a win. Since these patches don't expose anything at runtime (no knobs, switches, etc.) we leave ourselves plenty of flexibility for changing things in the future.
On 06/15/2012 03:23 PM, Blue Swirl wrote: > How about seccomp mode selected by command line switch -seccomp, in > which bind/connect/open/execve are forbidden? The functionality > remaining would be somewhat limited (can't migrate or use SMB etc. More properly, can't migrate with exec:command migration. But fd:nnn migration should still be viable.
On Fri, Jun 15, 2012 at 9:36 PM, Paul Moore <pmoore@redhat.com> wrote: > On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote: >> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote: >> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >> >> I think allowing execve() would render seccomp pretty much useless. >> > >> > Not necessarily. >> > >> > I'll agree that it does seem a bit odd to allow execve(), but there is >> > still value in enabling seccomp to disable potentially buggy/exploitable >> > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not >> > including the 32 bit versions, and even if we add all of the new syscalls >> > suggested in this thread we are still talking about a small subset of >> > syscalls. As far as security goes, the old adage of "less is more" >> > applies. >> >> The helper program being executed could need any of the 300 system >> calls, so we'd have to allow all. > > Don't we have some basic understanding of what the applications being exec'd > will need to do? I sorta see your point, but allowing the entire set of > syscalls seems a bit dramatic. At least qemu-ifup/down scripts, migration exec and smbd have been mentioned. Only the system calls made by smbd (for some version of it) can be known. The user could specify arbitrary commands for the others, those could be assumed to use some common (large) subset of system calls but I think the security value would be close to zero then. > >> > Protecting against the abuse and misuse of execve() is something that is >> > better done with the host's access controls (traditional DAC, MAC via the >> > LSM, etc.). >> >> How about seccomp mode selected by command line switch -seccomp, in >> which bind/connect/open/execve are forbidden? The functionality >> remaining would be somewhat limited (can't migrate or use SMB etc. >> until refactoring of QEMU), but that way seccomp jail would be much >> tighter. > > When I spoke to Anthony about this earlier (offline, sorry) he was opposed to > requiring any switches or user interaction to enable seccomp. I'm not sure if > his stance on this has changed any over the past few months. There could be two modes, strict mode (-seccomp) and default mode (only some syscalls blocked). With the future decomposed QEMU, strict seccomp mode would be default and the switch would be obsoleted. If the decomposition is planned to happen soonish, adding the switch would be just churn. > > In my perfect world, we would have a decomposed QEMU that functions as a > series of processes connected via some sort of IPC; the exact divisions are a > bit TBD and beyond the scope of this discussion. In this scenario we would be > able to restrict QEMU with sVirt and seccomp to a much higher degree than we > could with the current monolithic QEMU. > > I don't expect to see my perfect world any time soon, but in the meantime we > can still improve the security of QEMU on Linux with these seccomp patches and > for that reason I think it's a win. Since these patches don't expose anything > at runtime (no knobs, switches, etc.) we leave ourselves plenty of flexibility > for changing things in the future. Yes, I'm much in favor of adding seccomp support soon. But I just wonder if this is really the best level of security we can reach now, not assuming decomposed QEMU, but just minor tweaks? > > -- > paul moore > security and virtualization @ redhat >
On Fri, Jun 15, 2012 at 07:06:10PM +0000, Blue Swirl wrote: > On Wed, Jun 13, 2012 at 8:30 PM, Daniel P. Berrange <berrange@redhat.com> wrote: > > On Wed, Jun 13, 2012 at 04:20:22PM -0300, Eduardo Otubo wrote: > >> I added a syscall struct using priority levels as described in the > >> libseccomp man page. The priority numbers are based to the frequency > >> they appear in a sample strace from a regular qemu guest run under > >> libvirt. > >> > >> Libseccomp generates linear BPF code to filter system calls, those rules > >> are read one after another. The priority system places the most common > >> rules first in order to reduce the overhead when processing them. > >> > >> Also, since this is just a first RFC, the whitelist is a little raw. We > >> might need your help to improve, test and fine tune the set of system > >> calls. > >> > >> v2: Fixed some style issues > >> Removed code from vl.c and created qemu-seccomp.[ch] > >> Now using ARRAY_SIZE macro > >> Added more syscalls without priority/frequency set yet > >> > >> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> > >> --- > >> qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > >> qemu-seccomp.h | 9 +++++++ > >> vl.c | 7 ++++++ > >> 3 files changed, 89 insertions(+) > >> create mode 100644 qemu-seccomp.c > >> create mode 100644 qemu-seccomp.h > >> > >> diff --git a/qemu-seccomp.c b/qemu-seccomp.c > >> new file mode 100644 > >> index 0000000..048b7ba > >> --- /dev/null > >> +++ b/qemu-seccomp.c > >> @@ -0,0 +1,73 @@ > >> +#include <stdio.h> > >> +#include <seccomp.h> > >> +#include "qemu-seccomp.h" > >> + > >> +static struct QemuSeccompSyscall seccomp_whitelist[] = { > >> + { SCMP_SYS(timer_settime), 255 }, > >> + { SCMP_SYS(timer_gettime), 254 }, > >> + { SCMP_SYS(futex), 253 }, > >> + { SCMP_SYS(select), 252 }, > >> + { SCMP_SYS(recvfrom), 251 }, > >> + { SCMP_SYS(sendto), 250 }, > >> + { SCMP_SYS(read), 249 }, > >> + { SCMP_SYS(brk), 248 }, > >> + { SCMP_SYS(clone), 247 }, > >> + { SCMP_SYS(mmap), 247 }, > >> + { SCMP_SYS(mprotect), 246 }, > >> + { SCMP_SYS(ioctl), 245 }, > >> + { SCMP_SYS(recvmsg), 245 }, > >> + { SCMP_SYS(sendmsg), 245 }, > >> + { SCMP_SYS(accept), 245 }, > >> + { SCMP_SYS(connect), 245 }, > >> + { SCMP_SYS(bind), 245 }, > >> + { SCMP_SYS(listen), 245 }, > >> + { SCMP_SYS(ioctl), 245 }, > >> + { SCMP_SYS(eventfd), 245 }, > >> + { SCMP_SYS(rt_sigprocmask), 245 }, > >> + { SCMP_SYS(write), 244 }, > >> + { SCMP_SYS(fcntl), 243 }, > >> + { SCMP_SYS(tgkill), 242 }, > >> + { SCMP_SYS(rt_sigaction), 242 }, > >> + { SCMP_SYS(pipe2), 242 }, > >> + { SCMP_SYS(munmap), 242 }, > >> + { SCMP_SYS(mremap), 242 }, > >> + { SCMP_SYS(getsockname), 242 }, > >> + { SCMP_SYS(getpeername), 242 }, > >> + { SCMP_SYS(fdatasync), 242 }, > >> + { SCMP_SYS(close), 242 } > > > > execve(), so QEMU can run things like the ifup/down > > scripts, the samba daemon (sic), exec: migration protocol, > > etc, etc > > I think allowing execve() would render seccomp pretty much useless. So do I, but in the previous posting it was stated[1] that the intent is to allow all syscalls QEMU needs, and not have any loss of current functionality. Hence I'm reporting all syscalls that are missing that QEMU needs. Daniel [1] https://lists.gnu.org/archive/html/qemu-devel/2012-05/msg00928.html
On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: > > I think allowing execve() would render seccomp pretty much useless. > > Not necessarily. > > I'll agree that it does seem a bit odd to allow execve(), but there is still > value in enabling seccomp to disable potentially buggy/exploitable syscalls. > Let's not forget that we have over 300 syscalls on x86_64, not including the > 32 bit versions, and even if we add all of the new syscalls suggested in this > thread we are still talking about a small subset of syscalls. As far as > security goes, the old adage of "less is more" applies. I can sort of see this argument, but *only* if the QEMU process is being run under a dedicated, fully unprivileged (from a DAC pov) user, completely separate from anything else on the system. If QEMU were being run as root, then even with seccomp, it could trivially just overwrite some binary in /bin, update /proc/core-pattern to point to this binary, and then crash itself. Now that core handling binary will execute without any of the seccomp filters applied. Similarly if QEMU is being run in the user's desktop session, I'm sure there is some kind of similar attack possible by changing a config setting for the user's GNOME/KDE session, and then waiting for GNOME/KDE to execute the script that QEMU just wrote out, once again bypassing seccomp. Regards, Daniel
On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote: > On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com> wrote: > > On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote: > >> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote: > >> > I added a syscall struct using priority levels as described in the > >> > libseccomp man page. The priority numbers are based to the frequency > >> > they appear in a sample strace from a regular qemu guest run under > >> > libvirt. > >> > > >> > Libseccomp generates linear BPF code to filter system calls, those rules > >> > are read one after another. The priority system places the most common > >> > rules first in order to reduce the overhead when processing them. > >> > > >> > Also, since this is just a first RFC, the whitelist is a little raw. We > >> > might need your help to improve, test and fine tune the set of system > >> > calls. > >> > > >> > v2: Fixed some style issues > >> > Removed code from vl.c and created qemu-seccomp.[ch] > >> > Now using ARRAY_SIZE macro > >> > Added more syscalls without priority/frequency set yet > >> > > >> > Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> > >> > --- > >> > qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > >> > qemu-seccomp.h | 9 +++++++ > >> > vl.c | 7 ++++++ > >> > 3 files changed, 89 insertions(+) > >> > create mode 100644 qemu-seccomp.c > >> > create mode 100644 qemu-seccomp.h > >> > > >> > diff --git a/qemu-seccomp.c b/qemu-seccomp.c > >> > new file mode 100644 > >> > index 0000000..048b7ba > >> > --- /dev/null > >> > +++ b/qemu-seccomp.c > >> > @@ -0,0 +1,73 @@ > >> > >> Copyright and license info missing. > >> > >> > +#include <stdio.h> > >> > +#include <seccomp.h> > >> > +#include "qemu-seccomp.h" > >> > + > >> > +static struct QemuSeccompSyscall seccomp_whitelist[] = { > >> > >> 'const' > >> > >> > + { SCMP_SYS(timer_settime), 255 }, > >> > + { SCMP_SYS(timer_gettime), 254 }, > >> > + { SCMP_SYS(futex), 253 }, > >> > + { SCMP_SYS(select), 252 }, > >> > + { SCMP_SYS(recvfrom), 251 }, > >> > + { SCMP_SYS(sendto), 250 }, > >> > + { SCMP_SYS(read), 249 }, > >> > + { SCMP_SYS(brk), 248 }, > >> > + { SCMP_SYS(clone), 247 }, > >> > + { SCMP_SYS(mmap), 247 }, > >> > + { SCMP_SYS(mprotect), 246 }, > >> > + { SCMP_SYS(ioctl), 245 }, > >> > + { SCMP_SYS(recvmsg), 245 }, > >> > + { SCMP_SYS(sendmsg), 245 }, > >> > + { SCMP_SYS(accept), 245 }, > >> > + { SCMP_SYS(connect), 245 }, > >> > + { SCMP_SYS(bind), 245 }, > >> > >> It would be nice to avoid connect() and bind(). Perhaps seccomp init > >> should be postponed to after all sockets have been created? > > > > If you want to migrate your guest, you need to be able to > > call connect() at an arbitrary point in the QEMU process' > > lifecycle. So you can't avoid allowing connect(). Similarly > > if you want to allow hotplug of NICs (and their backends) > > then you need to have both bind() + connect() available. > > That's bad. Migration could conceivably be extended to use file > descriptor passing, but hotplug is more tricky. As with execve(), i'm reporting this on the basis that on the previous patch posting I was told we must whitelist any syscalls QEMU can conceivably use to avoid any loss in functionality. Daniel
On Mon, Jun 18, 2012 at 09:31:03AM +0100, Daniel P. Berrange wrote: > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: > > > I think allowing execve() would render seccomp pretty much useless. > > > > Not necessarily. > > > > I'll agree that it does seem a bit odd to allow execve(), but there is still > > value in enabling seccomp to disable potentially buggy/exploitable syscalls. > > Let's not forget that we have over 300 syscalls on x86_64, not including the > > 32 bit versions, and even if we add all of the new syscalls suggested in this > > thread we are still talking about a small subset of syscalls. As far as > > security goes, the old adage of "less is more" applies. > > I can sort of see this argument, but *only* if the QEMU process is being > run under a dedicated, fully unprivileged (from a DAC pov) user, completely > separate from anything else on the system. Or, of course, for a QEMU already confined by SELinux. > If QEMU were being run as root, then even with seccomp, it could trivially > just overwrite some binary in /bin, update /proc/core-pattern to point to > this binary, and then crash itself. Now that core handling binary will > execute without any of the seccomp filters applied. > > Similarly if QEMU is being run in the user's desktop session, I'm sure there > is some kind of similar attack possible by changing a config setting for the > user's GNOME/KDE session, and then waiting for GNOME/KDE to execute the script > that QEMU just wrote out, once again bypassing seccomp. Daniel
On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote: > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: > > > I think allowing execve() would render seccomp pretty much useless. > > > > Not necessarily. > > > > I'll agree that it does seem a bit odd to allow execve(), but there is > > still value in enabling seccomp to disable potentially buggy/exploitable > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not > > including the 32 bit versions, and even if we add all of the new syscalls > > suggested in this thread we are still talking about a small subset of > > syscalls. As far as security goes, the old adage of "less is more" > > applies. > > I can sort of see this argument, but *only* if the QEMU process is being > run under a dedicated, fully unprivileged (from a DAC pov) user, completely > separate from anything else on the system. > > Or, of course, for a QEMU already confined by SELinux. Agreed ... and considering at least one major distribution takes this approach it seems like reasonable functionality to me. Confining QEMU, either through DAC and/or MAC, when faced with potentially malicious guests is just good sense.
On Mon, Jun 18, 2012 at 09:52:44AM -0400, Paul Moore wrote: > On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote: > > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: > > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: > > > > I think allowing execve() would render seccomp pretty much useless. > > > > > > Not necessarily. > > > > > > I'll agree that it does seem a bit odd to allow execve(), but there is > > > still value in enabling seccomp to disable potentially buggy/exploitable > > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not > > > including the 32 bit versions, and even if we add all of the new syscalls > > > suggested in this thread we are still talking about a small subset of > > > syscalls. As far as security goes, the old adage of "less is more" > > > applies. > > > > I can sort of see this argument, but *only* if the QEMU process is being > > run under a dedicated, fully unprivileged (from a DAC pov) user, completely > > separate from anything else on the system. > > > > Or, of course, for a QEMU already confined by SELinux. > > Agreed ... and considering at least one major distribution takes this approach > it seems like reasonable functionality to me. Confining QEMU, either through > DAC and/or MAC, when faced with potentially malicious guests is just good > sense. Good, I'm not missing anything then. I'd suggest that future iterations of these patches explicitly mention the deployment scenarios in which this technology is able to offer increases security, and also describe the scenarios where it will not improve things. Regards, Daniel
On Monday, June 18, 2012 02:55:35 PM Daniel P. Berrange wrote: > On Mon, Jun 18, 2012 at 09:52:44AM -0400, Paul Moore wrote: > > On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote: > > > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: > > > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: > > > > > I think allowing execve() would render seccomp pretty much useless. > > > > > > > > Not necessarily. > > > > > > > > I'll agree that it does seem a bit odd to allow execve(), but there is > > > > still value in enabling seccomp to disable potentially > > > > buggy/exploitable > > > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, > > > > not > > > > including the 32 bit versions, and even if we add all of the new > > > > syscalls > > > > suggested in this thread we are still talking about a small subset of > > > > syscalls. As far as security goes, the old adage of "less is more" > > > > applies. > > > > > > I can sort of see this argument, but *only* if the QEMU process is being > > > run under a dedicated, fully unprivileged (from a DAC pov) user, > > > completely > > > separate from anything else on the system. > > > > > > Or, of course, for a QEMU already confined by SELinux. > > > > Agreed ... and considering at least one major distribution takes this > > approach it seems like reasonable functionality to me. Confining QEMU, > > either through DAC and/or MAC, when faced with potentially malicious > > guests is just good sense. > > Good, I'm not missing anything then. I'd suggest that future iterations > of these patches explicitly mention the deployment scenarios in which > this technology is able to offer increases security, and also describe > the scenarios where it will not improve things. Sounds like a reasonable request to me.
On 06/18/2012 04:33 AM, Daniel P. Berrange wrote: > On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote: >> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com> wrote: >>> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote: >>>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote: >>>>> I added a syscall struct using priority levels as described in the >>>>> libseccomp man page. The priority numbers are based to the frequency >>>>> they appear in a sample strace from a regular qemu guest run under >>>>> libvirt. >>>>> >>>>> Libseccomp generates linear BPF code to filter system calls, those rules >>>>> are read one after another. The priority system places the most common >>>>> rules first in order to reduce the overhead when processing them. >>>>> >>>>> Also, since this is just a first RFC, the whitelist is a little raw. We >>>>> might need your help to improve, test and fine tune the set of system >>>>> calls. >>>>> >>>>> v2: Fixed some style issues >>>>> Removed code from vl.c and created qemu-seccomp.[ch] >>>>> Now using ARRAY_SIZE macro >>>>> Added more syscalls without priority/frequency set yet >>>>> >>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> >>>>> --- >>>>> qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>> qemu-seccomp.h | 9 +++++++ >>>>> vl.c | 7 ++++++ >>>>> 3 files changed, 89 insertions(+) >>>>> create mode 100644 qemu-seccomp.c >>>>> create mode 100644 qemu-seccomp.h >>>>> >>>>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c >>>>> new file mode 100644 >>>>> index 0000000..048b7ba >>>>> --- /dev/null >>>>> +++ b/qemu-seccomp.c >>>>> @@ -0,0 +1,73 @@ >>>> >>>> Copyright and license info missing. >>>> >>>>> +#include <stdio.h> >>>>> +#include <seccomp.h> >>>>> +#include "qemu-seccomp.h" >>>>> + >>>>> +static struct QemuSeccompSyscall seccomp_whitelist[] = { >>>> >>>> 'const' >>>> >>>>> + { SCMP_SYS(timer_settime), 255 }, >>>>> + { SCMP_SYS(timer_gettime), 254 }, >>>>> + { SCMP_SYS(futex), 253 }, >>>>> + { SCMP_SYS(select), 252 }, >>>>> + { SCMP_SYS(recvfrom), 251 }, >>>>> + { SCMP_SYS(sendto), 250 }, >>>>> + { SCMP_SYS(read), 249 }, >>>>> + { SCMP_SYS(brk), 248 }, >>>>> + { SCMP_SYS(clone), 247 }, >>>>> + { SCMP_SYS(mmap), 247 }, >>>>> + { SCMP_SYS(mprotect), 246 }, >>>>> + { SCMP_SYS(ioctl), 245 }, >>>>> + { SCMP_SYS(recvmsg), 245 }, >>>>> + { SCMP_SYS(sendmsg), 245 }, >>>>> + { SCMP_SYS(accept), 245 }, >>>>> + { SCMP_SYS(connect), 245 }, >>>>> + { SCMP_SYS(bind), 245 }, >>>> >>>> It would be nice to avoid connect() and bind(). Perhaps seccomp init >>>> should be postponed to after all sockets have been created? >>> >>> If you want to migrate your guest, you need to be able to >>> call connect() at an arbitrary point in the QEMU process' >>> lifecycle. So you can't avoid allowing connect(). Similarly >>> if you want to allow hotplug of NICs (and their backends) >>> then you need to have both bind() + connect() available. >> >> That's bad. Migration could conceivably be extended to use file >> descriptor passing, but hotplug is more tricky. > > As with execve(), i'm reporting this on the basis that on the previous > patch posting I was told we must whitelist any syscalls QEMU can > conceivably use to avoid any loss in functionality. Thanks for pointing out syscalls needed for the whitelist. As Paul has already mentioned, it was recommended that we restrict all of QEMU (as a single process) from the start of execution. This is opposed to other options of restricting QEMU from the time that vCPUS start, further restricting based on syscall parms, or decomposing QEMU into multiple processes that are individually restricted with their own seccomp whitelists. I think this approach is a good starting point that can be further tuned in the future. And as with most security measures, defense in depth improves the cause (e.g. combining seccomp with DAC or MAC).
On 06/18/2012 04:31 AM, Daniel P. Berrange wrote: > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: >> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >>> I think allowing execve() would render seccomp pretty much useless. >> >> Not necessarily. >> >> I'll agree that it does seem a bit odd to allow execve(), but there is still >> value in enabling seccomp to disable potentially buggy/exploitable syscalls. >> Let's not forget that we have over 300 syscalls on x86_64, not including the >> 32 bit versions, and even if we add all of the new syscalls suggested in this >> thread we are still talking about a small subset of syscalls. As far as >> security goes, the old adage of "less is more" applies. > > I can sort of see this argument, but *only* if the QEMU process is being > run under a dedicated, fully unprivileged (from a DAC pov) user, completely > separate from anything else on the system. This might be a good point to plug Marcelo Cerri's DAC isolation patches that are on the libvirt mailing list. :) http://www.redhat.com/archives/libvir-list/2012-May/msg01005.html http://www.redhat.com/archives/libvir-list/2012-June/msg00020.html
On 06/16/2012 02:46 AM, Blue Swirl wrote: > On Fri, Jun 15, 2012 at 9:36 PM, Paul Moore <pmoore@redhat.com> wrote: >> On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote: >>> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote: >>>> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >>>>> I think allowing execve() would render seccomp pretty much useless. >>>> >>>> Not necessarily. >>>> >>>> I'll agree that it does seem a bit odd to allow execve(), but there is >>>> still value in enabling seccomp to disable potentially buggy/exploitable >>>> syscalls. Let's not forget that we have over 300 syscalls on x86_64, not >>>> including the 32 bit versions, and even if we add all of the new syscalls >>>> suggested in this thread we are still talking about a small subset of >>>> syscalls. As far as security goes, the old adage of "less is more" >>>> applies. >>> >>> The helper program being executed could need any of the 300 system >>> calls, so we'd have to allow all. >> >> Don't we have some basic understanding of what the applications being exec'd >> will need to do? I sorta see your point, but allowing the entire set of >> syscalls seems a bit dramatic. > > At least qemu-ifup/down scripts, migration exec and smbd have been > mentioned. Only the system calls made by smbd (for some version of it) > can be known. The user could specify arbitrary commands for the > others, those could be assumed to use some common (large) subset of > system calls but I think the security value would be close to zero > then. > I think at some point in the future we'll want to implement seccomp whitelists for each process that QEMU exec's. The bridge helper (-net bridge) is one that I'm familiar with. It is fairly minimal in the syscalls that it requires. It certainly doesn't need all the syscalls that QEMU needs. It is already severely restricted by SELinux (and AppArmor soon), but restricting the syscall footprint would be a nice addition.
On Mon, Jun 18, 2012 at 02:55:35PM +0100, Daniel P. Berrange wrote: > On Mon, Jun 18, 2012 at 09:52:44AM -0400, Paul Moore wrote: > > On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote: > > > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: > > > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: > > > > > I think allowing execve() would render seccomp pretty much useless. > > > > > > > > Not necessarily. > > > > > > > > I'll agree that it does seem a bit odd to allow execve(), but there is > > > > still value in enabling seccomp to disable potentially buggy/exploitable > > > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not > > > > including the 32 bit versions, and even if we add all of the new syscalls > > > > suggested in this thread we are still talking about a small subset of > > > > syscalls. As far as security goes, the old adage of "less is more" > > > > applies. > > > > > > I can sort of see this argument, but *only* if the QEMU process is being > > > run under a dedicated, fully unprivileged (from a DAC pov) user, completely > > > separate from anything else on the system. > > > > > > Or, of course, for a QEMU already confined by SELinux. > > > > Agreed ... and considering at least one major distribution takes this approach > > it seems like reasonable functionality to me. Confining QEMU, either through > > DAC and/or MAC, when faced with potentially malicious guests is just good > > sense. > > Good, I'm not missing anything then. I'd suggest that future iterations > of these patches explicitly mention the deployment scenarios in which > this technology is able to offer increases security, and also describe > the scenarios where it will not improve things. Please correct me if I'm wrong here, but I don't understand how exactly whitelisting execve() is odd. The white list is inherit and passed along the child processes so they also need to have their own syscalls filtered by BPF in the kernel as stated in the Will's commit log[1] - "Filter programs will be inherited across fork/clone and execve." - I wonder if this is main point of your concern. Whitelisting execve() or not should be no difference from the security pov. However, I agree that a possible future feature could a customized whitelist for each child process spawned. But for a first instance, the default whitelist should be enough to start seccomp support in Qemu. Also, as far as I understand, seccomp never meant to replace any of the technologies above mentioned. Using more than one layer of protection (SELinux, AppArmor MAC policy and/or DAC) should always be a good practice for the defense in depth. [1] - http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=e2cfabdfd075648216f99c2c03821cf3f47c1727
On Mon, Jun 18, 2012 at 8:31 AM, Daniel P. Berrange <berrange@redhat.com> wrote: > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: >> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >> > I think allowing execve() would render seccomp pretty much useless. >> >> Not necessarily. >> >> I'll agree that it does seem a bit odd to allow execve(), but there is still >> value in enabling seccomp to disable potentially buggy/exploitable syscalls. >> Let's not forget that we have over 300 syscalls on x86_64, not including the >> 32 bit versions, and even if we add all of the new syscalls suggested in this >> thread we are still talking about a small subset of syscalls. As far as >> security goes, the old adage of "less is more" applies. > > I can sort of see this argument, but *only* if the QEMU process is being > run under a dedicated, fully unprivileged (from a DAC pov) user, completely > separate from anything else on the system. > > If QEMU were being run as root, then even with seccomp, it could trivially > just overwrite some binary in /bin, update /proc/core-pattern to point to Not wiithout 'open'. When run as root, it would be nice to chroot() also to some empty directory and then drop chroot() privileges. > this binary, and then crash itself. Now that core handling binary will > execute without any of the seccomp filters applied. > > Similarly if QEMU is being run in the user's desktop session, I'm sure there > is some kind of similar attack possible by changing a config setting for the > user's GNOME/KDE session, and then waiting for GNOME/KDE to execute the script > that QEMU just wrote out, once again bypassing seccomp. > > Regards, > Daniel > -- > |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| > |: http://libvirt.org -o- http://virt-manager.org :| > |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| > |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On Mon, Jun 18, 2012 at 3:22 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote: > > > On 06/18/2012 04:33 AM, Daniel P. Berrange wrote: >> >> On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote: >>> >>> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com> >>> wrote: >>>> >>>> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote: >>>>> >>>>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo >>>>> <otubo@linux.vnet.ibm.com> wrote: >>>>>> >>>>>> I added a syscall struct using priority levels as described in the >>>>>> libseccomp man page. The priority numbers are based to the frequency >>>>>> they appear in a sample strace from a regular qemu guest run under >>>>>> libvirt. >>>>>> >>>>>> Libseccomp generates linear BPF code to filter system calls, those >>>>>> rules >>>>>> are read one after another. The priority system places the most common >>>>>> rules first in order to reduce the overhead when processing them. >>>>>> >>>>>> Also, since this is just a first RFC, the whitelist is a little raw. >>>>>> We >>>>>> might need your help to improve, test and fine tune the set of system >>>>>> calls. >>>>>> >>>>>> v2: Fixed some style issues >>>>>> Removed code from vl.c and created qemu-seccomp.[ch] >>>>>> Now using ARRAY_SIZE macro >>>>>> Added more syscalls without priority/frequency set yet >>>>>> >>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> >>>>>> --- >>>>>> qemu-seccomp.c | 73 >>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>>> qemu-seccomp.h | 9 +++++++ >>>>>> vl.c | 7 ++++++ >>>>>> 3 files changed, 89 insertions(+) >>>>>> create mode 100644 qemu-seccomp.c >>>>>> create mode 100644 qemu-seccomp.h >>>>>> >>>>>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c >>>>>> new file mode 100644 >>>>>> index 0000000..048b7ba >>>>>> --- /dev/null >>>>>> +++ b/qemu-seccomp.c >>>>>> @@ -0,0 +1,73 @@ >>>>> >>>>> >>>>> Copyright and license info missing. >>>>> >>>>>> +#include <stdio.h> >>>>>> +#include <seccomp.h> >>>>>> +#include "qemu-seccomp.h" >>>>>> + >>>>>> +static struct QemuSeccompSyscall seccomp_whitelist[] = { >>>>> >>>>> >>>>> 'const' >>>>> >>>>>> + { SCMP_SYS(timer_settime), 255 }, >>>>>> + { SCMP_SYS(timer_gettime), 254 }, >>>>>> + { SCMP_SYS(futex), 253 }, >>>>>> + { SCMP_SYS(select), 252 }, >>>>>> + { SCMP_SYS(recvfrom), 251 }, >>>>>> + { SCMP_SYS(sendto), 250 }, >>>>>> + { SCMP_SYS(read), 249 }, >>>>>> + { SCMP_SYS(brk), 248 }, >>>>>> + { SCMP_SYS(clone), 247 }, >>>>>> + { SCMP_SYS(mmap), 247 }, >>>>>> + { SCMP_SYS(mprotect), 246 }, >>>>>> + { SCMP_SYS(ioctl), 245 }, >>>>>> + { SCMP_SYS(recvmsg), 245 }, >>>>>> + { SCMP_SYS(sendmsg), 245 }, >>>>>> + { SCMP_SYS(accept), 245 }, >>>>>> + { SCMP_SYS(connect), 245 }, >>>>>> + { SCMP_SYS(bind), 245 }, >>>>> >>>>> >>>>> It would be nice to avoid connect() and bind(). Perhaps seccomp init >>>>> should be postponed to after all sockets have been created? >>>> >>>> >>>> If you want to migrate your guest, you need to be able to >>>> call connect() at an arbitrary point in the QEMU process' >>>> lifecycle. So you can't avoid allowing connect(). Similarly >>>> if you want to allow hotplug of NICs (and their backends) >>>> then you need to have both bind() + connect() available. >>> >>> >>> That's bad. Migration could conceivably be extended to use file >>> descriptor passing, but hotplug is more tricky. >> >> >> As with execve(), i'm reporting this on the basis that on the previous >> patch posting I was told we must whitelist any syscalls QEMU can >> conceivably use to avoid any loss in functionality. > > > Thanks for pointing out syscalls needed for the whitelist. > > As Paul has already mentioned, it was recommended that we restrict all of > QEMU (as a single process) from the start of execution. This is opposed to > other options of restricting QEMU from the time that vCPUS start, further > restricting based on syscall parms, or decomposing QEMU into multiple > processes that are individually restricted with their own seccomp > whitelists. Can each thread have separate seccomp whitelists? For example CPU threads should not need pretty much anything but the I/O thread needs I/O. > I think this approach is a good starting point that can be further tuned in > the future. And as with most security measures, defense in depth improves > the cause (e.g. combining seccomp with DAC or MAC). Agreed. > > -- > Regards, > Corey > >
On Mon, Jun 18, 2012 at 8:13 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote: > On Mon, Jun 18, 2012 at 02:55:35PM +0100, Daniel P. Berrange wrote: >> On Mon, Jun 18, 2012 at 09:52:44AM -0400, Paul Moore wrote: >> > On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote: >> > > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: >> > > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >> > > > > I think allowing execve() would render seccomp pretty much useless. >> > > > >> > > > Not necessarily. >> > > > >> > > > I'll agree that it does seem a bit odd to allow execve(), but there is >> > > > still value in enabling seccomp to disable potentially buggy/exploitable >> > > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not >> > > > including the 32 bit versions, and even if we add all of the new syscalls >> > > > suggested in this thread we are still talking about a small subset of >> > > > syscalls. As far as security goes, the old adage of "less is more" >> > > > applies. >> > > >> > > I can sort of see this argument, but *only* if the QEMU process is being >> > > run under a dedicated, fully unprivileged (from a DAC pov) user, completely >> > > separate from anything else on the system. >> > > >> > > Or, of course, for a QEMU already confined by SELinux. >> > >> > Agreed ... and considering at least one major distribution takes this approach >> > it seems like reasonable functionality to me. Confining QEMU, either through >> > DAC and/or MAC, when faced with potentially malicious guests is just good >> > sense. >> >> Good, I'm not missing anything then. I'd suggest that future iterations >> of these patches explicitly mention the deployment scenarios in which >> this technology is able to offer increases security, and also describe >> the scenarios where it will not improve things. > > Please correct me if I'm wrong here, but I don't understand how exactly > whitelisting execve() is odd. The white list is inherit and passed along > the child processes so they also need to have their own syscalls filtered > by BPF in the kernel as stated in the Will's commit log[1] - "Filter > programs will be inherited across fork/clone and execve." - I wonder if > this is main point of your concern. Whitelisting execve() or not should be > no difference from the security pov. The helper might behave strangely (read: according to attacker's wishes) when some of the required syscalls are not working. > However, I agree that a possible future feature could a customized > whitelist for each child process spawned. But for a first instance, the > default whitelist should be enough to start seccomp support in Qemu. It's a good way. I just wish the list weren't so open and lame to begin with. > > Also, as far as I understand, seccomp never meant to replace any of the > technologies above mentioned. Using more than one layer of protection > (SELinux, AppArmor MAC policy and/or DAC) should always be a good practice > for the defense in depth. > > [1] - > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=e2cfabdfd075648216f99c2c03821cf3f47c1727 > > -- > Eduardo Otubo > Software Engineer > Linux Technology Center > IBM Systems & Technology Group > Mobile: +55 19 8135 0885 > eotubo@linux.vnet.ibm.com >
On 06/18/2012 04:18 PM, Blue Swirl wrote: > On Mon, Jun 18, 2012 at 3:22 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote: >> >> >> On 06/18/2012 04:33 AM, Daniel P. Berrange wrote: >>> >>> On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote: >>>> >>>> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com> >>>> wrote: >>>>> >>>>> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote: >>>>>> >>>>>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo >>>>>> <otubo@linux.vnet.ibm.com> wrote: >>>>>>> >>>>>>> I added a syscall struct using priority levels as described in the >>>>>>> libseccomp man page. The priority numbers are based to the frequency >>>>>>> they appear in a sample strace from a regular qemu guest run under >>>>>>> libvirt. >>>>>>> >>>>>>> Libseccomp generates linear BPF code to filter system calls, those >>>>>>> rules >>>>>>> are read one after another. The priority system places the most common >>>>>>> rules first in order to reduce the overhead when processing them. >>>>>>> >>>>>>> Also, since this is just a first RFC, the whitelist is a little raw. >>>>>>> We >>>>>>> might need your help to improve, test and fine tune the set of system >>>>>>> calls. >>>>>>> >>>>>>> v2: Fixed some style issues >>>>>>> Removed code from vl.c and created qemu-seccomp.[ch] >>>>>>> Now using ARRAY_SIZE macro >>>>>>> Added more syscalls without priority/frequency set yet >>>>>>> >>>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> >>>>>>> --- >>>>>>> qemu-seccomp.c | 73 >>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>>>> qemu-seccomp.h | 9 +++++++ >>>>>>> vl.c | 7 ++++++ >>>>>>> 3 files changed, 89 insertions(+) >>>>>>> create mode 100644 qemu-seccomp.c >>>>>>> create mode 100644 qemu-seccomp.h >>>>>>> >>>>>>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c >>>>>>> new file mode 100644 >>>>>>> index 0000000..048b7ba >>>>>>> --- /dev/null >>>>>>> +++ b/qemu-seccomp.c >>>>>>> @@ -0,0 +1,73 @@ >>>>>> >>>>>> >>>>>> Copyright and license info missing. >>>>>> >>>>>>> +#include <stdio.h> >>>>>>> +#include <seccomp.h> >>>>>>> +#include "qemu-seccomp.h" >>>>>>> + >>>>>>> +static struct QemuSeccompSyscall seccomp_whitelist[] = { >>>>>> >>>>>> >>>>>> 'const' >>>>>> >>>>>>> + { SCMP_SYS(timer_settime), 255 }, >>>>>>> + { SCMP_SYS(timer_gettime), 254 }, >>>>>>> + { SCMP_SYS(futex), 253 }, >>>>>>> + { SCMP_SYS(select), 252 }, >>>>>>> + { SCMP_SYS(recvfrom), 251 }, >>>>>>> + { SCMP_SYS(sendto), 250 }, >>>>>>> + { SCMP_SYS(read), 249 }, >>>>>>> + { SCMP_SYS(brk), 248 }, >>>>>>> + { SCMP_SYS(clone), 247 }, >>>>>>> + { SCMP_SYS(mmap), 247 }, >>>>>>> + { SCMP_SYS(mprotect), 246 }, >>>>>>> + { SCMP_SYS(ioctl), 245 }, >>>>>>> + { SCMP_SYS(recvmsg), 245 }, >>>>>>> + { SCMP_SYS(sendmsg), 245 }, >>>>>>> + { SCMP_SYS(accept), 245 }, >>>>>>> + { SCMP_SYS(connect), 245 }, >>>>>>> + { SCMP_SYS(bind), 245 }, >>>>>> >>>>>> >>>>>> It would be nice to avoid connect() and bind(). Perhaps seccomp init >>>>>> should be postponed to after all sockets have been created? >>>>> >>>>> >>>>> If you want to migrate your guest, you need to be able to >>>>> call connect() at an arbitrary point in the QEMU process' >>>>> lifecycle. So you can't avoid allowing connect(). Similarly >>>>> if you want to allow hotplug of NICs (and their backends) >>>>> then you need to have both bind() + connect() available. >>>> >>>> >>>> That's bad. Migration could conceivably be extended to use file >>>> descriptor passing, but hotplug is more tricky. >>> >>> >>> As with execve(), i'm reporting this on the basis that on the previous >>> patch posting I was told we must whitelist any syscalls QEMU can >>> conceivably use to avoid any loss in functionality. >> >> >> Thanks for pointing out syscalls needed for the whitelist. >> >> As Paul has already mentioned, it was recommended that we restrict all of >> QEMU (as a single process) from the start of execution. This is opposed to >> other options of restricting QEMU from the time that vCPUS start, further >> restricting based on syscall parms, or decomposing QEMU into multiple >> processes that are individually restricted with their own seccomp >> whitelists. > > Can each thread have separate seccomp whitelists? For example CPU > threads should not need pretty much anything but the I/O thread needs > I/O. > No, seccomp filters are defined and enforced at the process level.
On Mon, Jun 18, 2012 at 08:15:37PM +0000, Blue Swirl wrote: > On Mon, Jun 18, 2012 at 8:31 AM, Daniel P. Berrange <berrange@redhat.com> wrote: > > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: > >> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: > >> > I think allowing execve() would render seccomp pretty much useless. > >> > >> Not necessarily. > >> > >> I'll agree that it does seem a bit odd to allow execve(), but there is still > >> value in enabling seccomp to disable potentially buggy/exploitable syscalls. > >> Let's not forget that we have over 300 syscalls on x86_64, not including the > >> 32 bit versions, and even if we add all of the new syscalls suggested in this > >> thread we are still talking about a small subset of syscalls. As far as > >> security goes, the old adage of "less is more" applies. > > > > I can sort of see this argument, but *only* if the QEMU process is being > > run under a dedicated, fully unprivileged (from a DAC pov) user, completely > > separate from anything else on the system. > > > > If QEMU were being run as root, then even with seccomp, it could trivially > > just overwrite some binary in /bin, update /proc/core-pattern to point to > > Not wiithout 'open'. When run as root, it would be nice to chroot() > also to some empty directory and then drop chroot() privileges. That's just another example of my point, that adding seccomp alone does nothing for QEMU security. It is only valuable when combined with another security technique, be it per-user DAC separation, SELinux MAC, or chroot, or splitting QEMU into multiple separate processes, or using Linux containers to confine it, etc Daniel
On 06/16/2012 09:46 AM, Blue Swirl wrote: > On Fri, Jun 15, 2012 at 9:36 PM, Paul Moore <pmoore@redhat.com> wrote: >> On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote: >>> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote: >>> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >>> >> I think allowing execve() would render seccomp pretty much useless. >>> > >>> > Not necessarily. >>> > >>> > I'll agree that it does seem a bit odd to allow execve(), but there is >>> > still value in enabling seccomp to disable potentially buggy/exploitable >>> > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not >>> > including the 32 bit versions, and even if we add all of the new syscalls >>> > suggested in this thread we are still talking about a small subset of >>> > syscalls. As far as security goes, the old adage of "less is more" >>> > applies. >>> >>> The helper program being executed could need any of the 300 system >>> calls, so we'd have to allow all. >> >> Don't we have some basic understanding of what the applications being exec'd >> will need to do? I sorta see your point, but allowing the entire set of >> syscalls seems a bit dramatic. > > At least qemu-ifup/down scripts, migration exec and smbd have been > mentioned. Only the system calls made by smbd (for some version of it) > can be known. The user could specify arbitrary commands for the > others, those could be assumed to use some common (large) subset of > system calls but I think the security value would be close to zero > then. We're not trying to protect against the user, but against the guest. If we assume the user wrote those scripts with care so they cannot be exploited by the guest, then we are okay. However I agree with you that it would be better to restrict those syscalls. The scripts are already unnecessary if using a management system and migration supports passed file descriptors, so that leaves only smbd, which can probably be pre-execed. > >> >>> > Protecting against the abuse and misuse of execve() is something that is >>> > better done with the host's access controls (traditional DAC, MAC via the >>> > LSM, etc.). >>> >>> How about seccomp mode selected by command line switch -seccomp, in >>> which bind/connect/open/execve are forbidden? The functionality >>> remaining would be somewhat limited (can't migrate or use SMB etc. >>> until refactoring of QEMU), but that way seccomp jail would be much >>> tighter. >> >> When I spoke to Anthony about this earlier (offline, sorry) he was opposed to >> requiring any switches or user interaction to enable seccomp. I'm not sure if >> his stance on this has changed any over the past few months. > > There could be two modes, strict mode (-seccomp) and default mode > (only some syscalls blocked). With the future decomposed QEMU, strict > seccomp mode would be default and the switch would be obsoleted. If > the decomposition is planned to happen soonish, adding the switch > would be just churn. We have decomposed qemu to some extent, in that privileged operations happen in libvirt. So the modes make sense - qemu has no idea whether a privileged management system is controlling it or not. > >> >> In my perfect world, we would have a decomposed QEMU that functions as a >> series of processes connected via some sort of IPC; the exact divisions are a >> bit TBD and beyond the scope of this discussion. In this scenario we would be >> able to restrict QEMU with sVirt and seccomp to a much higher degree than we >> could with the current monolithic QEMU. >> >> I don't expect to see my perfect world any time soon, but in the meantime we >> can still improve the security of QEMU on Linux with these seccomp patches and >> for that reason I think it's a win. Since these patches don't expose anything >> at runtime (no knobs, switches, etc.) we leave ourselves plenty of flexibility >> for changing things in the future. > > Yes, I'm much in favor of adding seccomp support soon. But I just > wonder if this is really the best level of security we can reach now, > not assuming decomposed QEMU, but just minor tweaks? We might disable mprotect(PROT_EXEC) if running with kvm.
On 06/19/2012 11:37 AM, Will Drewry wrote: > On Tue, Jun 19, 2012 at 8:35 AM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote: >> >> >> On 06/18/2012 06:14 PM, Will Drewry wrote: >>> >>> [-all] >>> >>> On Mon, Jun 18, 2012 at 4:53 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> >>> wrote: >>>> >>>> >>>> >>>> On 06/18/2012 04:18 PM, Blue Swirl wrote: >>>>> >>>>> >>>>> On Mon, Jun 18, 2012 at 3:22 PM, Corey Bryant >>>>> <coreyb@linux.vnet.ibm.com> >>>>> wrote: >>>>>> >>>>>> >>>>>> >>>>>> >>>>>> On 06/18/2012 04:33 AM, Daniel P. Berrange wrote: >>>>>>> >>>>>>> >>>>>>> >>>>>>> On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote: >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange >>>>>>>> <berrange@redhat.com> >>>>>>>> wrote: >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote: >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo >>>>>>>>>> <otubo@linux.vnet.ibm.com> wrote: >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> I added a syscall struct using priority levels as described in the >>>>>>>>>>> libseccomp man page. The priority numbers are based to the >>>>>>>>>>> frequency >>>>>>>>>>> they appear in a sample strace from a regular qemu guest run under >>>>>>>>>>> libvirt. >>>>>>>>>>> >>>>>>>>>>> Libseccomp generates linear BPF code to filter system calls, those >>>>>>>>>>> rules >>>>>>>>>>> are read one after another. The priority system places the most >>>>>>>>>>> common >>>>>>>>>>> rules first in order to reduce the overhead when processing them. >>>>>>>>>>> >>>>>>>>>>> Also, since this is just a first RFC, the whitelist is a little >>>>>>>>>>> raw. >>>>>>>>>>> We >>>>>>>>>>> might need your help to improve, test and fine tune the set of >>>>>>>>>>> system >>>>>>>>>>> calls. >>>>>>>>>>> >>>>>>>>>>> v2: Fixed some style issues >>>>>>>>>>> Removed code from vl.c and created qemu-seccomp.[ch] >>>>>>>>>>> Now using ARRAY_SIZE macro >>>>>>>>>>> Added more syscalls without priority/frequency set yet >>>>>>>>>>> >>>>>>>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> >>>>>>>>>>> --- >>>>>>>>>>> qemu-seccomp.c | 73 >>>>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >>>>>>>>>>> qemu-seccomp.h | 9 +++++++ >>>>>>>>>>> vl.c | 7 ++++++ >>>>>>>>>>> 3 files changed, 89 insertions(+) >>>>>>>>>>> create mode 100644 qemu-seccomp.c >>>>>>>>>>> create mode 100644 qemu-seccomp.h >>>>>>>>>>> >>>>>>>>>>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c >>>>>>>>>>> new file mode 100644 >>>>>>>>>>> index 0000000..048b7ba >>>>>>>>>>> --- /dev/null >>>>>>>>>>> +++ b/qemu-seccomp.c >>>>>>>>>>> @@ -0,0 +1,73 @@ >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> Copyright and license info missing. >>>>>>>>>> >>>>>>>>>>> +#include <stdio.h> >>>>>>>>>>> +#include <seccomp.h> >>>>>>>>>>> +#include "qemu-seccomp.h" >>>>>>>>>>> + >>>>>>>>>>> +static struct QemuSeccompSyscall seccomp_whitelist[] = { >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> 'const' >>>>>>>>>> >>>>>>>>>>> + { SCMP_SYS(timer_settime), 255 }, >>>>>>>>>>> + { SCMP_SYS(timer_gettime), 254 }, >>>>>>>>>>> + { SCMP_SYS(futex), 253 }, >>>>>>>>>>> + { SCMP_SYS(select), 252 }, >>>>>>>>>>> + { SCMP_SYS(recvfrom), 251 }, >>>>>>>>>>> + { SCMP_SYS(sendto), 250 }, >>>>>>>>>>> + { SCMP_SYS(read), 249 }, >>>>>>>>>>> + { SCMP_SYS(brk), 248 }, >>>>>>>>>>> + { SCMP_SYS(clone), 247 }, >>>>>>>>>>> + { SCMP_SYS(mmap), 247 }, >>>>>>>>>>> + { SCMP_SYS(mprotect), 246 }, >>>>>>>>>>> + { SCMP_SYS(ioctl), 245 }, >>>>>>>>>>> + { SCMP_SYS(recvmsg), 245 }, >>>>>>>>>>> + { SCMP_SYS(sendmsg), 245 }, >>>>>>>>>>> + { SCMP_SYS(accept), 245 }, >>>>>>>>>>> + { SCMP_SYS(connect), 245 }, >>>>>>>>>>> + { SCMP_SYS(bind), 245 }, >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> >>>>>>>>>> It would be nice to avoid connect() and bind(). Perhaps seccomp >>>>>>>>>> init >>>>>>>>>> should be postponed to after all sockets have been created? >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> >>>>>>>>> If you want to migrate your guest, you need to be able to >>>>>>>>> call connect() at an arbitrary point in the QEMU process' >>>>>>>>> lifecycle. So you can't avoid allowing connect(). Similarly >>>>>>>>> if you want to allow hotplug of NICs (and their backends) >>>>>>>>> then you need to have both bind() + connect() available. >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> >>>>>>>> That's bad. Migration could conceivably be extended to use file >>>>>>>> descriptor passing, but hotplug is more tricky. >>>>>>> >>>>>>> >>>>>>> >>>>>>> >>>>>>> As with execve(), i'm reporting this on the basis that on the previous >>>>>>> patch posting I was told we must whitelist any syscalls QEMU can >>>>>>> conceivably use to avoid any loss in functionality. >>>>>> >>>>>> >>>>>> >>>>>> >>>>>> Thanks for pointing out syscalls needed for the whitelist. >>>>>> >>>>>> As Paul has already mentioned, it was recommended that we restrict all >>>>>> of >>>>>> QEMU (as a single process) from the start of execution. This is >>>>>> opposed >>>>>> to >>>>>> other options of restricting QEMU from the time that vCPUS start, >>>>>> further >>>>>> restricting based on syscall parms, or decomposing QEMU into multiple >>>>>> processes that are individually restricted with their own seccomp >>>>>> whitelists. >>>>> >>>>> >>>>> >>>>> Can each thread have separate seccomp whitelists? For example CPU >>>>> threads should not need pretty much anything but the I/O thread needs >>>>> I/O. >>>>> >>>> >>>> No, seccomp filters are defined and enforced at the process level. >>> >>> >>> I'll keep lurking :) especially since I don't know the internals of >>> qemu well, but you can do per-thread seccomp filters since >>> processes==threads on linux. The real risk is that threads share so >>> much that an attack on the CPU thread may be able to parlay that into >>> a syscall proxy on a another thread. Probably what would make sense >>> in that way is a loose global filter, then have each sub-thread >>> install a functionality specific second filter. >>> >>> I may be way off base though, so feel free to just tell me to keep lurking >>> :) >>> >>> Thanks again for all the support and for pushing hard to get this >>> functionality in qemu! >> >> >> Please keep lurking! I appreciate the input and education. :) >> >> So whether it's a thread or process, I assume it will have its own a >> task_struct, allowing us to set a filter per thread or per process. The >> difference being that threads share more resources than processes. Sort of >> thinking out loud here to see if I'm right. > > Exactly! > >> It doesn't seem ideal vs process separation, but it's do-able. > > Yep -- so for something like qemu, you could install a global baseline > policy (e.g., union of all needed syscalls) then for each thread, they > can install a more restrictive set. The actual security guarantees > will be the total synthesis because of cross-thread attacks, but it > would make exploitation pretty painful. > > If you want better guarantees, then process separation is needed. One > option is even doing brokering for complex syscalls using either > ptrace or a sigsys handler, but that is likely too much to get into > while establishing a baseline. > In response to "Can each thread have separate seccomp whitelists?" please take a look at the thread above from Will Drewry. seccomp *can* be used per thread. However, it's not ideal vs per process seccomp filters.
On Tue, Jun 19, 2012 at 9:23 AM, Daniel P. Berrange <berrange@redhat.com> wrote: > On Mon, Jun 18, 2012 at 08:15:37PM +0000, Blue Swirl wrote: >> On Mon, Jun 18, 2012 at 8:31 AM, Daniel P. Berrange <berrange@redhat.com> wrote: >> > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote: >> >> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >> >> > I think allowing execve() would render seccomp pretty much useless. >> >> >> >> Not necessarily. >> >> >> >> I'll agree that it does seem a bit odd to allow execve(), but there is still >> >> value in enabling seccomp to disable potentially buggy/exploitable syscalls. >> >> Let's not forget that we have over 300 syscalls on x86_64, not including the >> >> 32 bit versions, and even if we add all of the new syscalls suggested in this >> >> thread we are still talking about a small subset of syscalls. As far as >> >> security goes, the old adage of "less is more" applies. >> > >> > I can sort of see this argument, but *only* if the QEMU process is being >> > run under a dedicated, fully unprivileged (from a DAC pov) user, completely >> > separate from anything else on the system. >> > >> > If QEMU were being run as root, then even with seccomp, it could trivially >> > just overwrite some binary in /bin, update /proc/core-pattern to point to >> >> Not wiithout 'open'. When run as root, it would be nice to chroot() >> also to some empty directory and then drop chroot() privileges. > > That's just another example of my point, that adding seccomp alone > does nothing for QEMU security. It is only valuable when combined > with another security technique, be it per-user DAC separation, > SELinux MAC, or chroot, or splitting QEMU into multiple separate > processes, or using Linux containers to confine it, etc I think seccomp with some changes to either functionality or with minor refactoring could do a lot more than with just 'accept all' white list. But as a starting point, this is sort of OK. > > > Daniel > -- > |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| > |: http://libvirt.org -o- http://virt-manager.org :| > |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| > |: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
On Tue, Jun 19, 2012 at 11:04 AM, Avi Kivity <avi@redhat.com> wrote: > On 06/16/2012 09:46 AM, Blue Swirl wrote: >> On Fri, Jun 15, 2012 at 9:36 PM, Paul Moore <pmoore@redhat.com> wrote: >>> On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote: >>>> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote: >>>> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote: >>>> >> I think allowing execve() would render seccomp pretty much useless. >>>> > >>>> > Not necessarily. >>>> > >>>> > I'll agree that it does seem a bit odd to allow execve(), but there is >>>> > still value in enabling seccomp to disable potentially buggy/exploitable >>>> > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not >>>> > including the 32 bit versions, and even if we add all of the new syscalls >>>> > suggested in this thread we are still talking about a small subset of >>>> > syscalls. As far as security goes, the old adage of "less is more" >>>> > applies. >>>> >>>> The helper program being executed could need any of the 300 system >>>> calls, so we'd have to allow all. >>> >>> Don't we have some basic understanding of what the applications being exec'd >>> will need to do? I sorta see your point, but allowing the entire set of >>> syscalls seems a bit dramatic. >> >> At least qemu-ifup/down scripts, migration exec and smbd have been >> mentioned. Only the system calls made by smbd (for some version of it) >> can be known. The user could specify arbitrary commands for the >> others, those could be assumed to use some common (large) subset of >> system calls but I think the security value would be close to zero >> then. > > We're not trying to protect against the user, but against the guest. If > we assume the user wrote those scripts with care so they cannot be > exploited by the guest, then we are okay. My concern was that first we could accidentally filter a system call that changes the script or executable behavior, much like sendmail + capabilities bug, and then a guest could trigger running this script/executable and exploit the changed behavior. > > However I agree with you that it would be better to restrict those > syscalls. The scripts are already unnecessary if using a management > system and migration supports passed file descriptors, so that leaves > only smbd, which can probably be pre-execed. File descriptor passing could also work for smbd. > >> >>> >>>> > Protecting against the abuse and misuse of execve() is something that is >>>> > better done with the host's access controls (traditional DAC, MAC via the >>>> > LSM, etc.). >>>> >>>> How about seccomp mode selected by command line switch -seccomp, in >>>> which bind/connect/open/execve are forbidden? The functionality >>>> remaining would be somewhat limited (can't migrate or use SMB etc. >>>> until refactoring of QEMU), but that way seccomp jail would be much >>>> tighter. >>> >>> When I spoke to Anthony about this earlier (offline, sorry) he was opposed to >>> requiring any switches or user interaction to enable seccomp. I'm not sure if >>> his stance on this has changed any over the past few months. >> >> There could be two modes, strict mode (-seccomp) and default mode >> (only some syscalls blocked). With the future decomposed QEMU, strict >> seccomp mode would be default and the switch would be obsoleted. If >> the decomposition is planned to happen soonish, adding the switch >> would be just churn. > > We have decomposed qemu to some extent, in that privileged operations > happen in libvirt. So the modes make sense - qemu has no idea whether a > privileged management system is controlling it or not. So with -seccomp, libvirt could tell QEMU that for example open(), execve(), bind() and connect() will never be needed? > >> >>> >>> In my perfect world, we would have a decomposed QEMU that functions as a >>> series of processes connected via some sort of IPC; the exact divisions are a >>> bit TBD and beyond the scope of this discussion. In this scenario we would be >>> able to restrict QEMU with sVirt and seccomp to a much higher degree than we >>> could with the current monolithic QEMU. >>> >>> I don't expect to see my perfect world any time soon, but in the meantime we >>> can still improve the security of QEMU on Linux with these seccomp patches and >>> for that reason I think it's a win. Since these patches don't expose anything >>> at runtime (no knobs, switches, etc.) we leave ourselves plenty of flexibility >>> for changing things in the future. >> >> Yes, I'm much in favor of adding seccomp support soon. But I just >> wonder if this is really the best level of security we can reach now, >> not assuming decomposed QEMU, but just minor tweaks? > > We might disable mprotect(PROT_EXEC) if running with kvm. > > -- > error compiling committee.c: too many arguments to function > >
On 06/19/2012 09:58 PM, Blue Swirl wrote: >>> At least qemu-ifup/down scripts, migration exec and smbd have been >>> mentioned. Only the system calls made by smbd (for some version of it) >>> can be known. The user could specify arbitrary commands for the >>> others, those could be assumed to use some common (large) subset of >>> system calls but I think the security value would be close to zero >>> then. >> >> We're not trying to protect against the user, but against the guest. If >> we assume the user wrote those scripts with care so they cannot be >> exploited by the guest, then we are okay. > > My concern was that first we could accidentally filter a system call > that changes the script or executable behavior, much like sendmail + > capabilities bug, and then a guest could trigger running this > script/executable and exploit the changed behavior. Ah, I see. I agree this is dangerous. We should probably disable exec if we seccomp. >> >> We have decomposed qemu to some extent, in that privileged operations >> happen in libvirt. So the modes make sense - qemu has no idea whether a >> privileged management system is controlling it or not. > > So with -seccomp, libvirt could tell QEMU that for example open(), > execve(), bind() and connect() will never be needed? Yes.
On 06/21/2012 03:04 AM, Avi Kivity wrote: > On 06/19/2012 09:58 PM, Blue Swirl wrote: >>>> At least qemu-ifup/down scripts, migration exec and smbd have been >>>> mentioned. Only the system calls made by smbd (for some version of it) >>>> can be known. The user could specify arbitrary commands for the >>>> others, those could be assumed to use some common (large) subset of >>>> system calls but I think the security value would be close to zero >>>> then. >>> >>> We're not trying to protect against the user, but against the guest. If >>> we assume the user wrote those scripts with care so they cannot be >>> exploited by the guest, then we are okay. >> >> My concern was that first we could accidentally filter a system call >> that changes the script or executable behavior, much like sendmail + >> capabilities bug, and then a guest could trigger running this >> script/executable and exploit the changed behavior. > > Ah, I see. I agree this is dangerous. We should probably disable exec > if we seccomp. There's no great place to jump into this thread so I guess I'll do it here. There is absolutely no doubt that white-listing syscalls that we currently use provides an improvement in security. We need to assume: 1) QEMU is run as an unprivileged user 2) QEMU is already heavily restricted by SELinux In this case, seccomp() is not being used to replace MAC or DAC. It's supplementing both of them by additionally filtering out syscalls that may have unknown kernel exploits in them. That's all this initial effort is about. Since it's scope is so limited, we can simply enable it unconditionally too. After we have this initial support, then we can look at a -sandbox option. This open could prevent things like open()/execve() but that will come at a cost of features. I think the reasonable thing to do for -sandbox is to basically focus on the set of syscalls that QEMU would use if it were launched under libvirt. We should obviously make improvements (things like -blockdev) to make this even more restrictive. Who knows, maybe we end up having multiple types of sandboxes. A '-sandbox libvirt' and a '-sandbox user' where the later is focused on the typical usage of an unprivileged user. But this is all stuff that can come later. We solve a big problem by just getting the initial whitelist support in. Regards, Anthony Liguori > >>> >>> We have decomposed qemu to some extent, in that privileged operations >>> happen in libvirt. So the modes make sense - qemu has no idea whether a >>> privileged management system is controlling it or not. >> >> So with -seccomp, libvirt could tell QEMU that for example open(), >> execve(), bind() and connect() will never be needed? > > Yes. >
On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: > On 06/21/2012 03:04 AM, Avi Kivity wrote: >> >> On 06/19/2012 09:58 PM, Blue Swirl wrote: >>>>> >>>>> At least qemu-ifup/down scripts, migration exec and smbd have been >>>>> mentioned. Only the system calls made by smbd (for some version of it) >>>>> can be known. The user could specify arbitrary commands for the >>>>> others, those could be assumed to use some common (large) subset of >>>>> system calls but I think the security value would be close to zero >>>>> then. >>>> >>>> >>>> We're not trying to protect against the user, but against the guest. If >>>> we assume the user wrote those scripts with care so they cannot be >>>> exploited by the guest, then we are okay. >>> >>> >>> My concern was that first we could accidentally filter a system call >>> that changes the script or executable behavior, much like sendmail + >>> capabilities bug, and then a guest could trigger running this >>> script/executable and exploit the changed behavior. >> >> >> Ah, I see. I agree this is dangerous. We should probably disable exec >> if we seccomp. > > > There's no great place to jump into this thread so I guess I'll do it here. > > There is absolutely no doubt that white-listing syscalls that we currently > use provides an improvement in security. > > We need to assume: > > 1) QEMU is run as an unprivileged user > > 2) QEMU is already heavily restricted by SELinux > > In this case, seccomp() is not being used to replace MAC or DAC. It's > supplementing both of them by additionally filtering out syscalls that may > have unknown kernel exploits in them. That's all this initial effort is > about. Since it's scope is so limited, we can simply enable it > unconditionally too. I don't think the scope is limited in a safe way. What is the set of system calls that can't ever cause problems to any possible ifup/down scripts, migration exec helpers and various versions of smbd? For example, unlink() is missing. What if the ifup/down script needs it for lock file cleanup? ftruncate()? Every socket syscalls in case LDAP is used to access user information by the libc? I think we can't define the safe set, except 'allow all'. I'd propose one of the following to avoid breakage: 1. Allow all system calls for the initial patch, refactor later to reduce the set. Useless until refactored. 2. Don't make seccomp mode enabled default, when enabled, forbid execve(). Limits functionality when enabled, no security benefit if not enabled. 3. Before enabling seccomp, fork a helper process without restrictions that is used to launch other programs. Needs some work. > > After we have this initial support, then we can look at a -sandbox option. > This open could prevent things like open()/execve() but that will come at a > cost of features. > > I think the reasonable thing to do for -sandbox is to basically focus on the > set of syscalls that QEMU would use if it were launched under libvirt. We > should obviously make improvements (things like -blockdev) to make this even > more restrictive. > > Who knows, maybe we end up having multiple types of sandboxes. A '-sandbox > libvirt' and a '-sandbox user' where the later is focused on the typical > usage of an unprivileged user. > > But this is all stuff that can come later. We solve a big problem by just > getting the initial whitelist support in. Fully agree, but we'd have to agree about what is a safe initial whitelist. > > Regards, > > Anthony Liguori > > >> >>>> >>>> We have decomposed qemu to some extent, in that privileged operations >>>> happen in libvirt. So the modes make sense - qemu has no idea whether a >>>> privileged management system is controlling it or not. >>> >>> >>> So with -seccomp, libvirt could tell QEMU that for example open(), >>> execve(), bind() and connect() will never be needed? >> >> >> Yes. >> >
On 06/28/2012 03:49 PM, Blue Swirl wrote: > On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: >> On 06/21/2012 03:04 AM, Avi Kivity wrote: >>> >>> On 06/19/2012 09:58 PM, Blue Swirl wrote: >>>>>> >>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been >>>>>> mentioned. Only the system calls made by smbd (for some version of it) >>>>>> can be known. The user could specify arbitrary commands for the >>>>>> others, those could be assumed to use some common (large) subset of >>>>>> system calls but I think the security value would be close to zero >>>>>> then. >>>>> >>>>> >>>>> We're not trying to protect against the user, but against the guest. If >>>>> we assume the user wrote those scripts with care so they cannot be >>>>> exploited by the guest, then we are okay. >>>> >>>> >>>> My concern was that first we could accidentally filter a system call >>>> that changes the script or executable behavior, much like sendmail + >>>> capabilities bug, and then a guest could trigger running this >>>> script/executable and exploit the changed behavior. >>> >>> >>> Ah, I see. I agree this is dangerous. We should probably disable exec >>> if we seccomp. >> >> >> There's no great place to jump into this thread so I guess I'll do it here. >> >> There is absolutely no doubt that white-listing syscalls that we currently >> use provides an improvement in security. >> >> We need to assume: >> >> 1) QEMU is run as an unprivileged user >> >> 2) QEMU is already heavily restricted by SELinux >> >> In this case, seccomp() is not being used to replace MAC or DAC. It's >> supplementing both of them by additionally filtering out syscalls that may >> have unknown kernel exploits in them. That's all this initial effort is >> about. Since it's scope is so limited, we can simply enable it >> unconditionally too. > > I don't think the scope is limited in a safe way. What is the set of > system calls that can't ever cause problems to any possible ifup/down > scripts, migration exec helpers and various versions of smbd? > > For example, unlink() is missing. What if the ifup/down script needs > it for lock file cleanup? ftruncate()? Every socket syscalls in case > LDAP is used to access user information by the libc? > > I think we can't define the safe set, except 'allow all'. I'd propose > one of the following to avoid breakage: > > 1. Allow all system calls for the initial patch, refactor later to > reduce the set. Useless until refactored. > > 2. Don't make seccomp mode enabled default, when enabled, forbid > execve(). Limits functionality when enabled, no security benefit if > not enabled. It should be noted that PR_SET_NO_NEW_PRIVS is set by default when the seccomp filter is enabled by libseccomp. This prevents any new privileges from being granted on execve. > > 3. Before enabling seccomp, fork a helper process without restrictions > that is used to launch other programs. Needs some work. > >> >> After we have this initial support, then we can look at a -sandbox option. >> This open could prevent things like open()/execve() but that will come at a >> cost of features. >> >> I think the reasonable thing to do for -sandbox is to basically focus on the >> set of syscalls that QEMU would use if it were launched under libvirt. We >> should obviously make improvements (things like -blockdev) to make this even >> more restrictive. >> >> Who knows, maybe we end up having multiple types of sandboxes. A '-sandbox >> libvirt' and a '-sandbox user' where the later is focused on the typical >> usage of an unprivileged user. >> >> But this is all stuff that can come later. We solve a big problem by just >> getting the initial whitelist support in. > > Fully agree, but we'd have to agree about what is a safe initial whitelist. > >> >> Regards, >> >> Anthony Liguori >> >> >>> >>>>> >>>>> We have decomposed qemu to some extent, in that privileged operations >>>>> happen in libvirt. So the modes make sense - qemu has no idea whether a >>>>> privileged management system is controlling it or not. >>>> >>>> >>>> So with -seccomp, libvirt could tell QEMU that for example open(), >>>> execve(), bind() and connect() will never be needed? >>> >>> >>> Yes. >>> >> >
On Fri, Jun 29, 2012 at 3:27 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote: > > > On 06/28/2012 03:49 PM, Blue Swirl wrote: >> >> On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> >> wrote: >>> >>> On 06/21/2012 03:04 AM, Avi Kivity wrote: >>>> >>>> >>>> On 06/19/2012 09:58 PM, Blue Swirl wrote: >>>>>>> >>>>>>> >>>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been >>>>>>> mentioned. Only the system calls made by smbd (for some version of >>>>>>> it) >>>>>>> can be known. The user could specify arbitrary commands for the >>>>>>> others, those could be assumed to use some common (large) subset of >>>>>>> system calls but I think the security value would be close to zero >>>>>>> then. >>>>>> >>>>>> >>>>>> >>>>>> We're not trying to protect against the user, but against the guest. >>>>>> If >>>>>> we assume the user wrote those scripts with care so they cannot be >>>>>> exploited by the guest, then we are okay. >>>>> >>>>> >>>>> >>>>> My concern was that first we could accidentally filter a system call >>>>> that changes the script or executable behavior, much like sendmail + >>>>> capabilities bug, and then a guest could trigger running this >>>>> script/executable and exploit the changed behavior. >>>> >>>> >>>> >>>> Ah, I see. I agree this is dangerous. We should probably disable exec >>>> if we seccomp. >>> >>> >>> >>> There's no great place to jump into this thread so I guess I'll do it >>> here. >>> >>> There is absolutely no doubt that white-listing syscalls that we >>> currently >>> use provides an improvement in security. >>> >>> We need to assume: >>> >>> 1) QEMU is run as an unprivileged user >>> >>> 2) QEMU is already heavily restricted by SELinux >>> >>> In this case, seccomp() is not being used to replace MAC or DAC. It's >>> supplementing both of them by additionally filtering out syscalls that >>> may >>> have unknown kernel exploits in them. That's all this initial effort is >>> about. Since it's scope is so limited, we can simply enable it >>> unconditionally too. >> >> >> I don't think the scope is limited in a safe way. What is the set of >> system calls that can't ever cause problems to any possible ifup/down >> scripts, migration exec helpers and various versions of smbd? >> >> For example, unlink() is missing. What if the ifup/down script needs >> it for lock file cleanup? ftruncate()? Every socket syscalls in case >> LDAP is used to access user information by the libc? >> >> I think we can't define the safe set, except 'allow all'. I'd propose >> one of the following to avoid breakage: >> >> 1. Allow all system calls for the initial patch, refactor later to >> reduce the set. Useless until refactored. >> >> 2. Don't make seccomp mode enabled default, when enabled, forbid >> execve(). Limits functionality when enabled, no security benefit if >> not enabled. > > > It should be noted that PR_SET_NO_NEW_PRIVS is set by default when the > seccomp filter is enabled by libseccomp. This prevents any new privileges > from being granted on execve. This is probably getting very hypothetical, but what happens if the ifup/down scripts need to run a setuid/gid helper or a helper with additional privileges from file system capabilities? > > >> >> 3. Before enabling seccomp, fork a helper process without restrictions >> that is used to launch other programs. Needs some work. >> >>> >>> After we have this initial support, then we can look at a -sandbox >>> option. >>> This open could prevent things like open()/execve() but that will come >>> at a >>> cost of features. >>> >>> I think the reasonable thing to do for -sandbox is to basically focus on >>> the >>> set of syscalls that QEMU would use if it were launched under libvirt. >>> We >>> should obviously make improvements (things like -blockdev) to make this >>> even >>> more restrictive. >>> >>> Who knows, maybe we end up having multiple types of sandboxes. A >>> '-sandbox >>> libvirt' and a '-sandbox user' where the later is focused on the typical >>> usage of an unprivileged user. >>> >>> But this is all stuff that can come later. We solve a big problem by >>> just >>> getting the initial whitelist support in. >> >> >> Fully agree, but we'd have to agree about what is a safe initial >> whitelist. >> >>> >>> Regards, >>> >>> Anthony Liguori >>> >>> >>>> >>>>>> >>>>>> We have decomposed qemu to some extent, in that privileged operations >>>>>> happen in libvirt. So the modes make sense - qemu has no idea whether >>>>>> a >>>>>> privileged management system is controlling it or not. >>>>> >>>>> >>>>> >>>>> So with -seccomp, libvirt could tell QEMU that for example open(), >>>>> execve(), bind() and connect() will never be needed? >>>> >>>> >>>> >>>> Yes. >>>> >>> >> > > -- > Regards, > Corey > >
On 06/29/2012 04:00 PM, Blue Swirl wrote: > On Fri, Jun 29, 2012 at 3:27 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote: >> >> >> On 06/28/2012 03:49 PM, Blue Swirl wrote: >>> >>> On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> >>> wrote: >>>> >>>> On 06/21/2012 03:04 AM, Avi Kivity wrote: >>>>> >>>>> >>>>> On 06/19/2012 09:58 PM, Blue Swirl wrote: >>>>>>>> >>>>>>>> >>>>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been >>>>>>>> mentioned. Only the system calls made by smbd (for some version of >>>>>>>> it) >>>>>>>> can be known. The user could specify arbitrary commands for the >>>>>>>> others, those could be assumed to use some common (large) subset of >>>>>>>> system calls but I think the security value would be close to zero >>>>>>>> then. >>>>>>> >>>>>>> >>>>>>> >>>>>>> We're not trying to protect against the user, but against the guest. >>>>>>> If >>>>>>> we assume the user wrote those scripts with care so they cannot be >>>>>>> exploited by the guest, then we are okay. >>>>>> >>>>>> >>>>>> >>>>>> My concern was that first we could accidentally filter a system call >>>>>> that changes the script or executable behavior, much like sendmail + >>>>>> capabilities bug, and then a guest could trigger running this >>>>>> script/executable and exploit the changed behavior. >>>>> >>>>> >>>>> >>>>> Ah, I see. I agree this is dangerous. We should probably disable exec >>>>> if we seccomp. >>>> >>>> >>>> >>>> There's no great place to jump into this thread so I guess I'll do it >>>> here. >>>> >>>> There is absolutely no doubt that white-listing syscalls that we >>>> currently >>>> use provides an improvement in security. >>>> >>>> We need to assume: >>>> >>>> 1) QEMU is run as an unprivileged user >>>> >>>> 2) QEMU is already heavily restricted by SELinux >>>> >>>> In this case, seccomp() is not being used to replace MAC or DAC. It's >>>> supplementing both of them by additionally filtering out syscalls that >>>> may >>>> have unknown kernel exploits in them. That's all this initial effort is >>>> about. Since it's scope is so limited, we can simply enable it >>>> unconditionally too. >>> >>> >>> I don't think the scope is limited in a safe way. What is the set of >>> system calls that can't ever cause problems to any possible ifup/down >>> scripts, migration exec helpers and various versions of smbd? >>> >>> For example, unlink() is missing. What if the ifup/down script needs >>> it for lock file cleanup? ftruncate()? Every socket syscalls in case >>> LDAP is used to access user information by the libc? >>> >>> I think we can't define the safe set, except 'allow all'. I'd propose >>> one of the following to avoid breakage: >>> >>> 1. Allow all system calls for the initial patch, refactor later to >>> reduce the set. Useless until refactored. >>> >>> 2. Don't make seccomp mode enabled default, when enabled, forbid >>> execve(). Limits functionality when enabled, no security benefit if >>> not enabled. >> >> >> It should be noted that PR_SET_NO_NEW_PRIVS is set by default when the >> seccomp filter is enabled by libseccomp. This prevents any new privileges >> from being granted on execve. > > This is probably getting very hypothetical, but what happens if the > ifup/down scripts need to run a setuid/gid helper or a helper with > additional privileges from file system capabilities? > I believe execve() will either fail or not grant the privileges.
Il 18/06/2012 23:53, Corey Bryant ha scritto: >> >> Can each thread have separate seccomp whitelists? For example CPU >> threads should not need pretty much anything but the I/O thread needs >> I/O. >> > > No, seccomp filters are defined and enforced at the process level. Perhaps we can add (at the kernel level) a way for seccomp filters to examine the current tid. Paolo
On Sun, Jul 1, 2012 at 8:25 AM, Paolo Bonzini <pbonzini@redhat.com> wrote: > Il 18/06/2012 23:53, Corey Bryant ha scritto: >>> >>> Can each thread have separate seccomp whitelists? For example CPU >>> threads should not need pretty much anything but the I/O thread needs >>> I/O. >>> >> >> No, seccomp filters are defined and enforced at the process level. > > Perhaps we can add (at the kernel level) a way for seccomp filters to > examine the current tid. seccomp filters are attached to the task_struct and apply per "thread" or per process since they both get their own task_structs. (For Linux, process==thread with shared resources.) Filter programs are also inherited across clone/fork, so it's possible to install a "global" filter program which applies which is inherited during thread creation, then apply per-thread refinements by stacking on additional filters (at the cost of additional evaluation time). hth! will
On 07/01/2012 10:18 PM, Will Drewry wrote: > On Sun, Jul 1, 2012 at 8:25 AM, Paolo Bonzini <pbonzini@redhat.com> wrote: >> Il 18/06/2012 23:53, Corey Bryant ha scritto: >>>> >>>> Can each thread have separate seccomp whitelists? For example CPU >>>> threads should not need pretty much anything but the I/O thread needs >>>> I/O. >>>> >>> >>> No, seccomp filters are defined and enforced at the process level. >> >> Perhaps we can add (at the kernel level) a way for seccomp filters to >> examine the current tid. Sorry for the confusion. I corrected my statement in a later thread based on Will's input: http://lists.nongnu.org/archive/html/qemu-devel/2012-06/msg03212.html > > seccomp filters are attached to the task_struct and apply per "thread" > or per process since they both get their own task_structs. (For > Linux, process==thread with shared resources.) Filter programs are > also inherited across clone/fork, so it's possible to install a > "global" filter program which applies which is inherited during thread > creation, then apply per-thread refinements by stacking on additional > filters (at the cost of additional evaluation time). > > hth! > will > Thanks!
On 06/28/2012 03:49 PM, Blue Swirl wrote: > On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> wrote: >> On 06/21/2012 03:04 AM, Avi Kivity wrote: >>> >>> On 06/19/2012 09:58 PM, Blue Swirl wrote: >>>>>> >>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been >>>>>> mentioned. Only the system calls made by smbd (for some version of it) >>>>>> can be known. The user could specify arbitrary commands for the >>>>>> others, those could be assumed to use some common (large) subset of >>>>>> system calls but I think the security value would be close to zero >>>>>> then. >>>>> >>>>> >>>>> We're not trying to protect against the user, but against the guest. If >>>>> we assume the user wrote those scripts with care so they cannot be >>>>> exploited by the guest, then we are okay. >>>> >>>> >>>> My concern was that first we could accidentally filter a system call >>>> that changes the script or executable behavior, much like sendmail + >>>> capabilities bug, and then a guest could trigger running this >>>> script/executable and exploit the changed behavior. >>> >>> >>> Ah, I see. I agree this is dangerous. We should probably disable exec >>> if we seccomp. >> >> >> There's no great place to jump into this thread so I guess I'll do it here. >> >> There is absolutely no doubt that white-listing syscalls that we currently >> use provides an improvement in security. >> >> We need to assume: >> >> 1) QEMU is run as an unprivileged user >> >> 2) QEMU is already heavily restricted by SELinux >> >> In this case, seccomp() is not being used to replace MAC or DAC. It's >> supplementing both of them by additionally filtering out syscalls that may >> have unknown kernel exploits in them. That's all this initial effort is >> about. Since it's scope is so limited, we can simply enable it >> unconditionally too. > > I don't think the scope is limited in a safe way. What is the set of > system calls that can't ever cause problems to any possible ifup/down > scripts, migration exec helpers and various versions of smbd? > > For example, unlink() is missing. What if the ifup/down script needs > it for lock file cleanup? ftruncate()? Every socket syscalls in case > LDAP is used to access user information by the libc? > > I think we can't define the safe set, except 'allow all'. I'd propose > one of the following to avoid breakage: > > 1. Allow all system calls for the initial patch, refactor later to > reduce the set. Useless until refactored. One thing I like about starting with a known subset of syscalls used by QEMU is that it forces us to expand the whitelist if we come across more syscalls that QEMU uses. An issue with this approach is that if seccomp kills QEMU for using a disallowed syscall, I don't think we know what syscall it is. (At least, I don't think it is accessible anywhere.) This is good for security but makes it hard for developers who are debugging. Would it make sense to have the ability to configure QEMU in either: 1) seccomp kill mode (this is what the existing patches do), or 2) seccomp debug mode? In debug mode we could trap on the failing syscall (using SCMP_ACT_TRAP), determine the syscall value, and issue an error message that displays the syscall value. The emulator() function here gives an idea of how this could be done: https://lkml.org/lkml/2012/4/12/449 > > 2. Don't make seccomp mode enabled default, when enabled, forbid > execve(). Limits functionality when enabled, no security benefit if > not enabled. > > 3. Before enabling seccomp, fork a helper process without restrictions > that is used to launch other programs. Needs some work. > >> >> After we have this initial support, then we can look at a -sandbox option. >> This open could prevent things like open()/execve() but that will come at a >> cost of features. >> >> I think the reasonable thing to do for -sandbox is to basically focus on the >> set of syscalls that QEMU would use if it were launched under libvirt. We >> should obviously make improvements (things like -blockdev) to make this even >> more restrictive. >> >> Who knows, maybe we end up having multiple types of sandboxes. A '-sandbox >> libvirt' and a '-sandbox user' where the later is focused on the typical >> usage of an unprivileged user. >> >> But this is all stuff that can come later. We solve a big problem by just >> getting the initial whitelist support in. > > Fully agree, but we'd have to agree about what is a safe initial whitelist. > >> >> Regards, >> >> Anthony Liguori >> >> >>> >>>>> >>>>> We have decomposed qemu to some extent, in that privileged operations >>>>> happen in libvirt. So the modes make sense - qemu has no idea whether a >>>>> privileged management system is controlling it or not. >>>> >>>> >>>> So with -seccomp, libvirt could tell QEMU that for example open(), >>>> execve(), bind() and connect() will never be needed? >>> >>> >>> Yes. >>> >> >
On Mon, Jul 2, 2012 at 6:05 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote: > > > On 06/28/2012 03:49 PM, Blue Swirl wrote: >> >> On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> >> wrote: >>> >>> On 06/21/2012 03:04 AM, Avi Kivity wrote: >>>> >>>> >>>> On 06/19/2012 09:58 PM, Blue Swirl wrote: >>>>>>> >>>>>>> >>>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been >>>>>>> mentioned. Only the system calls made by smbd (for some version of >>>>>>> it) >>>>>>> can be known. The user could specify arbitrary commands for the >>>>>>> others, those could be assumed to use some common (large) subset of >>>>>>> system calls but I think the security value would be close to zero >>>>>>> then. >>>>>> >>>>>> >>>>>> >>>>>> We're not trying to protect against the user, but against the guest. >>>>>> If >>>>>> we assume the user wrote those scripts with care so they cannot be >>>>>> exploited by the guest, then we are okay. >>>>> >>>>> >>>>> >>>>> My concern was that first we could accidentally filter a system call >>>>> that changes the script or executable behavior, much like sendmail + >>>>> capabilities bug, and then a guest could trigger running this >>>>> script/executable and exploit the changed behavior. >>>> >>>> >>>> >>>> Ah, I see. I agree this is dangerous. We should probably disable exec >>>> if we seccomp. >>> >>> >>> >>> There's no great place to jump into this thread so I guess I'll do it >>> here. >>> >>> There is absolutely no doubt that white-listing syscalls that we >>> currently >>> use provides an improvement in security. >>> >>> We need to assume: >>> >>> 1) QEMU is run as an unprivileged user >>> >>> 2) QEMU is already heavily restricted by SELinux >>> >>> In this case, seccomp() is not being used to replace MAC or DAC. It's >>> supplementing both of them by additionally filtering out syscalls that >>> may >>> have unknown kernel exploits in them. That's all this initial effort is >>> about. Since it's scope is so limited, we can simply enable it >>> unconditionally too. >> >> >> I don't think the scope is limited in a safe way. What is the set of >> system calls that can't ever cause problems to any possible ifup/down >> scripts, migration exec helpers and various versions of smbd? >> >> For example, unlink() is missing. What if the ifup/down script needs >> it for lock file cleanup? ftruncate()? Every socket syscalls in case >> LDAP is used to access user information by the libc? >> >> I think we can't define the safe set, except 'allow all'. I'd propose >> one of the following to avoid breakage: >> >> 1. Allow all system calls for the initial patch, refactor later to >> reduce the set. Useless until refactored. > > > One thing I like about starting with a known subset of syscalls used by QEMU > is that it forces us to expand the whitelist if we come across more syscalls > that QEMU uses. Finding out what QEMU uses is the relatively easy part. Finding out what the external helpers might use seems to be impossible. > > An issue with this approach is that if seccomp kills QEMU for using a > disallowed syscall, I don't think we know what syscall it is. (At least, I > don't think it is accessible anywhere.) This is good for security but makes > it hard for developers who are debugging. > > Would it make sense to have the ability to configure QEMU in either: > 1) seccomp kill mode (this is what the existing patches do), or > 2) seccomp debug mode? > > In debug mode we could trap on the failing syscall (using SCMP_ACT_TRAP), > determine the syscall value, and issue an error message that displays the > syscall value. I think that it would be nice and it would be useful also after any refactoring. > > The emulator() function here gives an idea of how this could be done: > https://lkml.org/lkml/2012/4/12/449 > > >> >> 2. Don't make seccomp mode enabled default, when enabled, forbid >> execve(). Limits functionality when enabled, no security benefit if >> not enabled. >> >> 3. Before enabling seccomp, fork a helper process without restrictions >> that is used to launch other programs. Needs some work. >> >>> >>> After we have this initial support, then we can look at a -sandbox >>> option. >>> This open could prevent things like open()/execve() but that will come >>> at a >>> cost of features. >>> >>> I think the reasonable thing to do for -sandbox is to basically focus on >>> the >>> set of syscalls that QEMU would use if it were launched under libvirt. >>> We >>> should obviously make improvements (things like -blockdev) to make this >>> even >>> more restrictive. >>> >>> Who knows, maybe we end up having multiple types of sandboxes. A >>> '-sandbox >>> libvirt' and a '-sandbox user' where the later is focused on the typical >>> usage of an unprivileged user. >>> >>> But this is all stuff that can come later. We solve a big problem by >>> just >>> getting the initial whitelist support in. >> >> >> Fully agree, but we'd have to agree about what is a safe initial >> whitelist. >> >>> >>> Regards, >>> >>> Anthony Liguori >>> >>> >>>> >>>>>> >>>>>> We have decomposed qemu to some extent, in that privileged operations >>>>>> happen in libvirt. So the modes make sense - qemu has no idea whether >>>>>> a >>>>>> privileged management system is controlling it or not. >>>>> >>>>> >>>>> >>>>> So with -seccomp, libvirt could tell QEMU that for example open(), >>>>> execve(), bind() and connect() will never be needed? >>>> >>>> >>>> >>>> Yes. >>>> >>> >> > > -- > Regards, > Corey > >
diff --git a/qemu-seccomp.c b/qemu-seccomp.c new file mode 100644 index 0000000..048b7ba --- /dev/null +++ b/qemu-seccomp.c @@ -0,0 +1,73 @@ +#include <stdio.h> +#include <seccomp.h> +#include "qemu-seccomp.h" + +static struct QemuSeccompSyscall seccomp_whitelist[] = { + { SCMP_SYS(timer_settime), 255 }, + { SCMP_SYS(timer_gettime), 254 }, + { SCMP_SYS(futex), 253 }, + { SCMP_SYS(select), 252 }, + { SCMP_SYS(recvfrom), 251 }, + { SCMP_SYS(sendto), 250 }, + { SCMP_SYS(read), 249 }, + { SCMP_SYS(brk), 248 }, + { SCMP_SYS(clone), 247 }, + { SCMP_SYS(mmap), 247 }, + { SCMP_SYS(mprotect), 246 }, + { SCMP_SYS(ioctl), 245 }, + { SCMP_SYS(recvmsg), 245 }, + { SCMP_SYS(sendmsg), 245 }, + { SCMP_SYS(accept), 245 }, + { SCMP_SYS(connect), 245 }, + { SCMP_SYS(bind), 245 }, + { SCMP_SYS(listen), 245 }, + { SCMP_SYS(ioctl), 245 }, + { SCMP_SYS(eventfd), 245 }, + { SCMP_SYS(rt_sigprocmask), 245 }, + { SCMP_SYS(write), 244 }, + { SCMP_SYS(fcntl), 243 }, + { SCMP_SYS(tgkill), 242 }, + { SCMP_SYS(rt_sigaction), 242 }, + { SCMP_SYS(pipe2), 242 }, + { SCMP_SYS(munmap), 242 }, + { SCMP_SYS(mremap), 242 }, + { SCMP_SYS(getsockname), 242 }, + { SCMP_SYS(getpeername), 242 }, + { SCMP_SYS(fdatasync), 242 }, + { SCMP_SYS(close), 242 } +}; + +#define seccomp_whitelist_count ARRAY_SIZE(seccomp_whitelist) + +int seccomp_start(void) +{ + int rc = 0; + unsigned int i = 0; + + rc = seccomp_init(SCMP_ACT_KILL); + if (rc < 0) { + goto seccomp_return; + } + + for (i = 0; i < seccomp_whitelist_count; i++) { + rc = seccomp_rule_add(SCMP_ACT_ALLOW, seccomp_whitelist[i].num, 0); + if (rc < 0) { + goto seccomp_return; + } + rc = seccomp_syscall_priority(seccomp_whitelist[i].num, + seccomp_whitelist[i].priority); + if (rc < 0) { + goto seccomp_return; + } + } + + rc = seccomp_load(); + + seccomp_return: + seccomp_release(); + if (rc < 0) { + fprintf(stderr, + "ERROR: failed to configure the seccomp syscall filter in the kernel\n"); + } + return rc; +} diff --git a/qemu-seccomp.h b/qemu-seccomp.h new file mode 100644 index 0000000..3bbdd87 --- /dev/null +++ b/qemu-seccomp.h @@ -0,0 +1,9 @@ +#include <seccomp.h> +#include "osdep.h" + +struct QemuSeccompSyscall { + int32_t num; + uint8_t priority; +}; + +int seccomp_start(void); diff --git a/vl.c b/vl.c index 204d85b..315afaf 100644 --- a/vl.c +++ b/vl.c @@ -61,6 +61,9 @@ #include <linux/ppdev.h> #include <linux/parport.h> +#ifdef CONFIG_LIBSECCOMP +#include "qemu-seccomp.h" +#endif #endif #ifdef __sun__ #include <sys/stat.h> @@ -2296,6 +2299,10 @@ int main(int argc, char **argv, char **envp) const char *trace_events = NULL; const char *trace_file = NULL; +#ifdef CONFIG_LIBSECCOMP + seccomp_start(); +#endif + atexit(qemu_run_exit_notifiers); error_set_progname(argv[0]);
I added a syscall struct using priority levels as described in the libseccomp man page. The priority numbers are based to the frequency they appear in a sample strace from a regular qemu guest run under libvirt. Libseccomp generates linear BPF code to filter system calls, those rules are read one after another. The priority system places the most common rules first in order to reduce the overhead when processing them. Also, since this is just a first RFC, the whitelist is a little raw. We might need your help to improve, test and fine tune the set of system calls. v2: Fixed some style issues Removed code from vl.c and created qemu-seccomp.[ch] Now using ARRAY_SIZE macro Added more syscalls without priority/frequency set yet Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com> --- qemu-seccomp.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ qemu-seccomp.h | 9 +++++++ vl.c | 7 ++++++ 3 files changed, 89 insertions(+) create mode 100644 qemu-seccomp.c create mode 100644 qemu-seccomp.h