diff mbox

[RFC,PATCHv2,2/2] Adding basic calls to libseccomp in vl.c

Message ID f2fa29790fb19e6596e1cda0d441e0f45730b533.1339614945.git.otubo@linux.vnet.ibm.com
State New
Headers show

Commit Message

Eduardo Otubo June 13, 2012, 7:20 p.m. UTC
I added a syscall struct using priority levels as described in the
libseccomp man page. The priority numbers are based to the frequency
they appear in a sample strace from a regular qemu guest run under
libvirt.

Libseccomp generates linear BPF code to filter system calls, those rules
are read one after another. The priority system places the most common
rules first in order to reduce the overhead when processing them.

Also, since this is just a first RFC, the whitelist is a little raw. We
might need your help to improve, test and fine tune the set of system
calls.

v2: Fixed some style issues
	Removed code from vl.c and created qemu-seccomp.[ch]
	Now using ARRAY_SIZE macro
	Added more syscalls without priority/frequency set yet

Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
---
 qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-seccomp.h |    9 +++++++
 vl.c           |    7 ++++++
 3 files changed, 89 insertions(+)
 create mode 100644 qemu-seccomp.c
 create mode 100644 qemu-seccomp.h

Comments

Blue Swirl June 13, 2012, 7:56 p.m. UTC | #1
On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote:
> I added a syscall struct using priority levels as described in the
> libseccomp man page. The priority numbers are based to the frequency
> they appear in a sample strace from a regular qemu guest run under
> libvirt.
>
> Libseccomp generates linear BPF code to filter system calls, those rules
> are read one after another. The priority system places the most common
> rules first in order to reduce the overhead when processing them.
>
> Also, since this is just a first RFC, the whitelist is a little raw. We
> might need your help to improve, test and fine tune the set of system
> calls.
>
> v2: Fixed some style issues
>        Removed code from vl.c and created qemu-seccomp.[ch]
>        Now using ARRAY_SIZE macro
>        Added more syscalls without priority/frequency set yet
>
> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> ---
>  qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  qemu-seccomp.h |    9 +++++++
>  vl.c           |    7 ++++++
>  3 files changed, 89 insertions(+)
>  create mode 100644 qemu-seccomp.c
>  create mode 100644 qemu-seccomp.h
>
> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
> new file mode 100644
> index 0000000..048b7ba
> --- /dev/null
> +++ b/qemu-seccomp.c
> @@ -0,0 +1,73 @@

Copyright and license info missing.

> +#include <stdio.h>
> +#include <seccomp.h>
> +#include "qemu-seccomp.h"
> +
> +static struct QemuSeccompSyscall seccomp_whitelist[] = {

'const'

> +    { SCMP_SYS(timer_settime), 255 },
> +    { SCMP_SYS(timer_gettime), 254 },
> +    { SCMP_SYS(futex), 253 },
> +    { SCMP_SYS(select), 252 },
> +    { SCMP_SYS(recvfrom), 251 },
> +    { SCMP_SYS(sendto), 250 },
> +    { SCMP_SYS(read), 249 },
> +    { SCMP_SYS(brk), 248 },
> +    { SCMP_SYS(clone), 247 },
> +    { SCMP_SYS(mmap), 247 },
> +    { SCMP_SYS(mprotect), 246 },
> +    { SCMP_SYS(ioctl), 245 },
> +    { SCMP_SYS(recvmsg), 245 },
> +    { SCMP_SYS(sendmsg), 245 },
> +    { SCMP_SYS(accept), 245 },
> +    { SCMP_SYS(connect), 245 },
> +    { SCMP_SYS(bind), 245 },

It would be nice to avoid connect() and bind(). Perhaps seccomp init
should be postponed to after all sockets have been created?

> +    { SCMP_SYS(listen), 245 },
> +    { SCMP_SYS(ioctl), 245 },
> +    { SCMP_SYS(eventfd), 245 },
> +    { SCMP_SYS(rt_sigprocmask), 245 },
> +    { SCMP_SYS(write), 244 },
> +    { SCMP_SYS(fcntl), 243 },
> +    { SCMP_SYS(tgkill), 242 },
> +    { SCMP_SYS(rt_sigaction), 242 },
> +    { SCMP_SYS(pipe2), 242 },
> +    { SCMP_SYS(munmap), 242 },
> +    { SCMP_SYS(mremap), 242 },
> +    { SCMP_SYS(getsockname), 242 },
> +    { SCMP_SYS(getpeername), 242 },
> +    { SCMP_SYS(fdatasync), 242 },
> +    { SCMP_SYS(close), 242 }
> +};
> +
> +#define seccomp_whitelist_count ARRAY_SIZE(seccomp_whitelist)

I'm not sure the #define helps much.

> +
> +int seccomp_start(void)
> +{
> +    int rc = 0;
> +    unsigned int i = 0;
> +
> +    rc = seccomp_init(SCMP_ACT_KILL);
> +    if (rc < 0) {
> +        goto seccomp_return;
> +    }
> +
> +    for (i = 0; i < seccomp_whitelist_count; i++) {
> +        rc = seccomp_rule_add(SCMP_ACT_ALLOW, seccomp_whitelist[i].num, 0);
> +        if (rc < 0) {
> +            goto seccomp_return;
> +        }
> +        rc = seccomp_syscall_priority(seccomp_whitelist[i].num,
> +                                      seccomp_whitelist[i].priority);
> +        if (rc < 0) {
> +            goto seccomp_return;
> +        }
> +    }
> +
> +    rc = seccomp_load();
> +
> +  seccomp_return:
> +    seccomp_release();
> +    if (rc < 0) {
> +        fprintf(stderr,
> +                "ERROR: failed to configure the seccomp syscall filter in the kernel\n");

Should this be fatal?

> +    }
> +    return rc;

Return value is not used.

> +}
> diff --git a/qemu-seccomp.h b/qemu-seccomp.h
> new file mode 100644
> index 0000000..3bbdd87
> --- /dev/null
> +++ b/qemu-seccomp.h
> @@ -0,0 +1,9 @@

Usual header protection #ifndeffery missing.

> +#include <seccomp.h>
> +#include "osdep.h"
> +
> +struct QemuSeccompSyscall {
> +    int32_t num;
> +    uint8_t priority;
> +};

This definition is not used elsewhere, so it should be internal to
qemu-seccomp.c.

> +
> +int seccomp_start(void);
> diff --git a/vl.c b/vl.c
> index 204d85b..315afaf 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -61,6 +61,9 @@
>
>  #include <linux/ppdev.h>
>  #include <linux/parport.h>
> +#ifdef CONFIG_LIBSECCOMP
> +#include "qemu-seccomp.h"
> +#endif
>  #endif
>  #ifdef __sun__
>  #include <sys/stat.h>
> @@ -2296,6 +2299,10 @@ int main(int argc, char **argv, char **envp)
>     const char *trace_events = NULL;
>     const char *trace_file = NULL;
>
> +#ifdef CONFIG_LIBSECCOMP
> +    seccomp_start();
> +#endif
> +
>     atexit(qemu_run_exit_notifiers);
>     error_set_progname(argv[0]);
>
> --
> 1.7.9.5
>
>
Daniel P. Berrangé June 13, 2012, 8:30 p.m. UTC | #2
On Wed, Jun 13, 2012 at 04:20:22PM -0300, Eduardo Otubo wrote:
> I added a syscall struct using priority levels as described in the
> libseccomp man page. The priority numbers are based to the frequency
> they appear in a sample strace from a regular qemu guest run under
> libvirt.
> 
> Libseccomp generates linear BPF code to filter system calls, those rules
> are read one after another. The priority system places the most common
> rules first in order to reduce the overhead when processing them.
> 
> Also, since this is just a first RFC, the whitelist is a little raw. We
> might need your help to improve, test and fine tune the set of system
> calls.
> 
> v2: Fixed some style issues
> 	Removed code from vl.c and created qemu-seccomp.[ch]
> 	Now using ARRAY_SIZE macro
> 	Added more syscalls without priority/frequency set yet
> 
> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> ---
>  qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  qemu-seccomp.h |    9 +++++++
>  vl.c           |    7 ++++++
>  3 files changed, 89 insertions(+)
>  create mode 100644 qemu-seccomp.c
>  create mode 100644 qemu-seccomp.h
> 
> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
> new file mode 100644
> index 0000000..048b7ba
> --- /dev/null
> +++ b/qemu-seccomp.c
> @@ -0,0 +1,73 @@
> +#include <stdio.h>
> +#include <seccomp.h>
> +#include "qemu-seccomp.h"
> +
> +static struct QemuSeccompSyscall seccomp_whitelist[] = {
> +    { SCMP_SYS(timer_settime), 255 },
> +    { SCMP_SYS(timer_gettime), 254 },
> +    { SCMP_SYS(futex), 253 },
> +    { SCMP_SYS(select), 252 },
> +    { SCMP_SYS(recvfrom), 251 },
> +    { SCMP_SYS(sendto), 250 },
> +    { SCMP_SYS(read), 249 },
> +    { SCMP_SYS(brk), 248 },
> +    { SCMP_SYS(clone), 247 },
> +    { SCMP_SYS(mmap), 247 },
> +    { SCMP_SYS(mprotect), 246 },
> +    { SCMP_SYS(ioctl), 245 },
> +    { SCMP_SYS(recvmsg), 245 },
> +    { SCMP_SYS(sendmsg), 245 },
> +    { SCMP_SYS(accept), 245 },
> +    { SCMP_SYS(connect), 245 },
> +    { SCMP_SYS(bind), 245 },
> +    { SCMP_SYS(listen), 245 },
> +    { SCMP_SYS(ioctl), 245 },
> +    { SCMP_SYS(eventfd), 245 },
> +    { SCMP_SYS(rt_sigprocmask), 245 },
> +    { SCMP_SYS(write), 244 },
> +    { SCMP_SYS(fcntl), 243 },
> +    { SCMP_SYS(tgkill), 242 },
> +    { SCMP_SYS(rt_sigaction), 242 },
> +    { SCMP_SYS(pipe2), 242 },
> +    { SCMP_SYS(munmap), 242 },
> +    { SCMP_SYS(mremap), 242 },
> +    { SCMP_SYS(getsockname), 242 },
> +    { SCMP_SYS(getpeername), 242 },
> +    { SCMP_SYS(fdatasync), 242 },
> +    { SCMP_SYS(close), 242 }

execve(), so QEMU can run things like the ifup/down
scripts, the samba daemon (sic), exec: migration protocol,
etc, etc


Daniel
Daniel P. Berrangé June 13, 2012, 8:33 p.m. UTC | #3
On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote:
> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote:
> > I added a syscall struct using priority levels as described in the
> > libseccomp man page. The priority numbers are based to the frequency
> > they appear in a sample strace from a regular qemu guest run under
> > libvirt.
> >
> > Libseccomp generates linear BPF code to filter system calls, those rules
> > are read one after another. The priority system places the most common
> > rules first in order to reduce the overhead when processing them.
> >
> > Also, since this is just a first RFC, the whitelist is a little raw. We
> > might need your help to improve, test and fine tune the set of system
> > calls.
> >
> > v2: Fixed some style issues
> >        Removed code from vl.c and created qemu-seccomp.[ch]
> >        Now using ARRAY_SIZE macro
> >        Added more syscalls without priority/frequency set yet
> >
> > Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> > ---
> >  qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  qemu-seccomp.h |    9 +++++++
> >  vl.c           |    7 ++++++
> >  3 files changed, 89 insertions(+)
> >  create mode 100644 qemu-seccomp.c
> >  create mode 100644 qemu-seccomp.h
> >
> > diff --git a/qemu-seccomp.c b/qemu-seccomp.c
> > new file mode 100644
> > index 0000000..048b7ba
> > --- /dev/null
> > +++ b/qemu-seccomp.c
> > @@ -0,0 +1,73 @@
> 
> Copyright and license info missing.
> 
> > +#include <stdio.h>
> > +#include <seccomp.h>
> > +#include "qemu-seccomp.h"
> > +
> > +static struct QemuSeccompSyscall seccomp_whitelist[] = {
> 
> 'const'
> 
> > +    { SCMP_SYS(timer_settime), 255 },
> > +    { SCMP_SYS(timer_gettime), 254 },
> > +    { SCMP_SYS(futex), 253 },
> > +    { SCMP_SYS(select), 252 },
> > +    { SCMP_SYS(recvfrom), 251 },
> > +    { SCMP_SYS(sendto), 250 },
> > +    { SCMP_SYS(read), 249 },
> > +    { SCMP_SYS(brk), 248 },
> > +    { SCMP_SYS(clone), 247 },
> > +    { SCMP_SYS(mmap), 247 },
> > +    { SCMP_SYS(mprotect), 246 },
> > +    { SCMP_SYS(ioctl), 245 },
> > +    { SCMP_SYS(recvmsg), 245 },
> > +    { SCMP_SYS(sendmsg), 245 },
> > +    { SCMP_SYS(accept), 245 },
> > +    { SCMP_SYS(connect), 245 },
> > +    { SCMP_SYS(bind), 245 },
> 
> It would be nice to avoid connect() and bind(). Perhaps seccomp init
> should be postponed to after all sockets have been created?

If you want to migrate your guest, you need to be able to
call connect() at an arbitrary point in the QEMU process'
lifecycle. So you can't avoid allowing connect(). Similarly
if you want to allow hotplug of NICs (and their backends)
then you need to have both bind() + connect() available.


Daniel
Daniel P. Berrangé June 13, 2012, 8:37 p.m. UTC | #4
On Wed, Jun 13, 2012 at 04:20:22PM -0300, Eduardo Otubo wrote:
> I added a syscall struct using priority levels as described in the
> libseccomp man page. The priority numbers are based to the frequency
> they appear in a sample strace from a regular qemu guest run under
> libvirt.
> 
> Libseccomp generates linear BPF code to filter system calls, those rules
> are read one after another. The priority system places the most common
> rules first in order to reduce the overhead when processing them.
> 
> Also, since this is just a first RFC, the whitelist is a little raw. We
> might need your help to improve, test and fine tune the set of system
> calls.
> 
> v2: Fixed some style issues
> 	Removed code from vl.c and created qemu-seccomp.[ch]
> 	Now using ARRAY_SIZE macro
> 	Added more syscalls without priority/frequency set yet
>
> +static struct QemuSeccompSyscall seccomp_whitelist[] = {
> +    { SCMP_SYS(timer_settime), 255 },
> +    { SCMP_SYS(timer_gettime), 254 },
> +    { SCMP_SYS(futex), 253 },
> +    { SCMP_SYS(select), 252 },
> +    { SCMP_SYS(recvfrom), 251 },
> +    { SCMP_SYS(sendto), 250 },
> +    { SCMP_SYS(read), 249 },
> +    { SCMP_SYS(brk), 248 },
> +    { SCMP_SYS(clone), 247 },
> +    { SCMP_SYS(mmap), 247 },
> +    { SCMP_SYS(mprotect), 246 },
> +    { SCMP_SYS(ioctl), 245 },
> +    { SCMP_SYS(recvmsg), 245 },
> +    { SCMP_SYS(sendmsg), 245 },
> +    { SCMP_SYS(accept), 245 },
> +    { SCMP_SYS(connect), 245 },
> +    { SCMP_SYS(bind), 245 },
> +    { SCMP_SYS(listen), 245 },
> +    { SCMP_SYS(ioctl), 245 },
> +    { SCMP_SYS(eventfd), 245 },
> +    { SCMP_SYS(rt_sigprocmask), 245 },
> +    { SCMP_SYS(write), 244 },
> +    { SCMP_SYS(fcntl), 243 },
> +    { SCMP_SYS(tgkill), 242 },
> +    { SCMP_SYS(rt_sigaction), 242 },
> +    { SCMP_SYS(pipe2), 242 },
> +    { SCMP_SYS(munmap), 242 },
> +    { SCMP_SYS(mremap), 242 },
> +    { SCMP_SYS(getsockname), 242 },
> +    { SCMP_SYS(getpeername), 242 },
> +    { SCMP_SYS(fdatasync), 242 },
> +    { SCMP_SYS(close), 242 }
> +};

Hmm, this is still missing the 'open' syscall. How does QEMU work
at all with these patches applied, without this syscall listed ?

Daniel
Blue Swirl June 15, 2012, 7:04 p.m. UTC | #5
On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com> wrote:
> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote:
>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote:
>> > I added a syscall struct using priority levels as described in the
>> > libseccomp man page. The priority numbers are based to the frequency
>> > they appear in a sample strace from a regular qemu guest run under
>> > libvirt.
>> >
>> > Libseccomp generates linear BPF code to filter system calls, those rules
>> > are read one after another. The priority system places the most common
>> > rules first in order to reduce the overhead when processing them.
>> >
>> > Also, since this is just a first RFC, the whitelist is a little raw. We
>> > might need your help to improve, test and fine tune the set of system
>> > calls.
>> >
>> > v2: Fixed some style issues
>> >        Removed code from vl.c and created qemu-seccomp.[ch]
>> >        Now using ARRAY_SIZE macro
>> >        Added more syscalls without priority/frequency set yet
>> >
>> > Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>> > ---
>> >  qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> >  qemu-seccomp.h |    9 +++++++
>> >  vl.c           |    7 ++++++
>> >  3 files changed, 89 insertions(+)
>> >  create mode 100644 qemu-seccomp.c
>> >  create mode 100644 qemu-seccomp.h
>> >
>> > diff --git a/qemu-seccomp.c b/qemu-seccomp.c
>> > new file mode 100644
>> > index 0000000..048b7ba
>> > --- /dev/null
>> > +++ b/qemu-seccomp.c
>> > @@ -0,0 +1,73 @@
>>
>> Copyright and license info missing.
>>
>> > +#include <stdio.h>
>> > +#include <seccomp.h>
>> > +#include "qemu-seccomp.h"
>> > +
>> > +static struct QemuSeccompSyscall seccomp_whitelist[] = {
>>
>> 'const'
>>
>> > +    { SCMP_SYS(timer_settime), 255 },
>> > +    { SCMP_SYS(timer_gettime), 254 },
>> > +    { SCMP_SYS(futex), 253 },
>> > +    { SCMP_SYS(select), 252 },
>> > +    { SCMP_SYS(recvfrom), 251 },
>> > +    { SCMP_SYS(sendto), 250 },
>> > +    { SCMP_SYS(read), 249 },
>> > +    { SCMP_SYS(brk), 248 },
>> > +    { SCMP_SYS(clone), 247 },
>> > +    { SCMP_SYS(mmap), 247 },
>> > +    { SCMP_SYS(mprotect), 246 },
>> > +    { SCMP_SYS(ioctl), 245 },
>> > +    { SCMP_SYS(recvmsg), 245 },
>> > +    { SCMP_SYS(sendmsg), 245 },
>> > +    { SCMP_SYS(accept), 245 },
>> > +    { SCMP_SYS(connect), 245 },
>> > +    { SCMP_SYS(bind), 245 },
>>
>> It would be nice to avoid connect() and bind(). Perhaps seccomp init
>> should be postponed to after all sockets have been created?
>
> If you want to migrate your guest, you need to be able to
> call connect() at an arbitrary point in the QEMU process'
> lifecycle. So you can't avoid allowing connect(). Similarly
> if you want to allow hotplug of NICs (and their backends)
> then you need to have both bind() + connect() available.

That's bad. Migration could conceivably be extended to use file
descriptor passing, but hotplug is more tricky.

It's probably easier to split QEMU to two processes, one with greater
rights like bind(), connect() etc. and another with tighter set.

>
>
> Daniel
> --
> |: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org              -o-             http://virt-manager.org :|
> |: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
> |: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|
Blue Swirl June 15, 2012, 7:06 p.m. UTC | #6
On Wed, Jun 13, 2012 at 8:30 PM, Daniel P. Berrange <berrange@redhat.com> wrote:
> On Wed, Jun 13, 2012 at 04:20:22PM -0300, Eduardo Otubo wrote:
>> I added a syscall struct using priority levels as described in the
>> libseccomp man page. The priority numbers are based to the frequency
>> they appear in a sample strace from a regular qemu guest run under
>> libvirt.
>>
>> Libseccomp generates linear BPF code to filter system calls, those rules
>> are read one after another. The priority system places the most common
>> rules first in order to reduce the overhead when processing them.
>>
>> Also, since this is just a first RFC, the whitelist is a little raw. We
>> might need your help to improve, test and fine tune the set of system
>> calls.
>>
>> v2: Fixed some style issues
>>       Removed code from vl.c and created qemu-seccomp.[ch]
>>       Now using ARRAY_SIZE macro
>>       Added more syscalls without priority/frequency set yet
>>
>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>> ---
>>  qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  qemu-seccomp.h |    9 +++++++
>>  vl.c           |    7 ++++++
>>  3 files changed, 89 insertions(+)
>>  create mode 100644 qemu-seccomp.c
>>  create mode 100644 qemu-seccomp.h
>>
>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
>> new file mode 100644
>> index 0000000..048b7ba
>> --- /dev/null
>> +++ b/qemu-seccomp.c
>> @@ -0,0 +1,73 @@
>> +#include <stdio.h>
>> +#include <seccomp.h>
>> +#include "qemu-seccomp.h"
>> +
>> +static struct QemuSeccompSyscall seccomp_whitelist[] = {
>> +    { SCMP_SYS(timer_settime), 255 },
>> +    { SCMP_SYS(timer_gettime), 254 },
>> +    { SCMP_SYS(futex), 253 },
>> +    { SCMP_SYS(select), 252 },
>> +    { SCMP_SYS(recvfrom), 251 },
>> +    { SCMP_SYS(sendto), 250 },
>> +    { SCMP_SYS(read), 249 },
>> +    { SCMP_SYS(brk), 248 },
>> +    { SCMP_SYS(clone), 247 },
>> +    { SCMP_SYS(mmap), 247 },
>> +    { SCMP_SYS(mprotect), 246 },
>> +    { SCMP_SYS(ioctl), 245 },
>> +    { SCMP_SYS(recvmsg), 245 },
>> +    { SCMP_SYS(sendmsg), 245 },
>> +    { SCMP_SYS(accept), 245 },
>> +    { SCMP_SYS(connect), 245 },
>> +    { SCMP_SYS(bind), 245 },
>> +    { SCMP_SYS(listen), 245 },
>> +    { SCMP_SYS(ioctl), 245 },
>> +    { SCMP_SYS(eventfd), 245 },
>> +    { SCMP_SYS(rt_sigprocmask), 245 },
>> +    { SCMP_SYS(write), 244 },
>> +    { SCMP_SYS(fcntl), 243 },
>> +    { SCMP_SYS(tgkill), 242 },
>> +    { SCMP_SYS(rt_sigaction), 242 },
>> +    { SCMP_SYS(pipe2), 242 },
>> +    { SCMP_SYS(munmap), 242 },
>> +    { SCMP_SYS(mremap), 242 },
>> +    { SCMP_SYS(getsockname), 242 },
>> +    { SCMP_SYS(getpeername), 242 },
>> +    { SCMP_SYS(fdatasync), 242 },
>> +    { SCMP_SYS(close), 242 }
>
> execve(), so QEMU can run things like the ifup/down
> scripts, the samba daemon (sic), exec: migration protocol,
> etc, etc

I think allowing execve() would render seccomp pretty much useless.

>
>
> Daniel
> --
> |: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org              -o-             http://virt-manager.org :|
> |: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
> |: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|
>
Paul Moore June 15, 2012, 9:02 p.m. UTC | #7
On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> I think allowing execve() would render seccomp pretty much useless.

Not necessarily.

I'll agree that it does seem a bit odd to allow execve(), but there is still 
value in enabling seccomp to disable potentially buggy/exploitable syscalls.  
Let's not forget that we have over 300 syscalls on x86_64, not including the 
32 bit versions, and even if we add all of the new syscalls suggested in this 
thread we are still talking about a small subset of syscalls.  As far as 
security goes, the old adage of "less is more" applies.

Protecting against the abuse and misuse of execve() is something that is 
better done with the host's access controls (traditional DAC, MAC via the LSM, 
etc.).
Blue Swirl June 15, 2012, 9:23 p.m. UTC | #8
On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote:
> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>> I think allowing execve() would render seccomp pretty much useless.
>
> Not necessarily.
>
> I'll agree that it does seem a bit odd to allow execve(), but there is still
> value in enabling seccomp to disable potentially buggy/exploitable syscalls.
> Let's not forget that we have over 300 syscalls on x86_64, not including the
> 32 bit versions, and even if we add all of the new syscalls suggested in this
> thread we are still talking about a small subset of syscalls.  As far as
> security goes, the old adage of "less is more" applies.

The helper program being executed could need any of the 300 system
calls, so we'd have to allow all.

>
> Protecting against the abuse and misuse of execve() is something that is
> better done with the host's access controls (traditional DAC, MAC via the LSM,
> etc.).

How about seccomp mode selected by command line switch -seccomp, in
which bind/connect/open/execve are forbidden? The functionality
remaining would be somewhat limited (can't migrate or use SMB etc.
until refactoring of QEMU), but that way seccomp jail would be much
tighter.

>
> --
> paul moore
> security and virtualization @ redhat
>
Paul Moore June 15, 2012, 9:36 p.m. UTC | #9
On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote:
> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote:
> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> >> I think allowing execve() would render seccomp pretty much useless.
> > 
> > Not necessarily.
> > 
> > I'll agree that it does seem a bit odd to allow execve(), but there is
> > still value in enabling seccomp to disable potentially buggy/exploitable
> > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
> > including the 32 bit versions, and even if we add all of the new syscalls
> > suggested in this thread we are still talking about a small subset of
> > syscalls.  As far as security goes, the old adage of "less is more"
> > applies.
> 
> The helper program being executed could need any of the 300 system
> calls, so we'd have to allow all.

Don't we have some basic understanding of what the applications being exec'd 
will need to do?  I sorta see your point, but allowing the entire set of 
syscalls seems a bit dramatic.

> > Protecting against the abuse and misuse of execve() is something that is
> > better done with the host's access controls (traditional DAC, MAC via the
> > LSM, etc.).
> 
> How about seccomp mode selected by command line switch -seccomp, in
> which bind/connect/open/execve are forbidden? The functionality
> remaining would be somewhat limited (can't migrate or use SMB etc.
> until refactoring of QEMU), but that way seccomp jail would be much
> tighter.

When I spoke to Anthony about this earlier (offline, sorry) he was opposed to 
requiring any switches or user interaction to enable seccomp.  I'm not sure if 
his stance on this has changed any over the past few months.

In my perfect world, we would have a decomposed QEMU that functions as a 
series of processes connected via some sort of IPC; the exact divisions are a 
bit TBD and beyond the scope of this discussion.  In this scenario we would be 
able to restrict QEMU with sVirt and seccomp to a much higher degree than we 
could with the current monolithic QEMU.

I don't expect to see my perfect world any time soon, but in the meantime we 
can still improve the security of QEMU on Linux with these seccomp patches and 
for that reason I think it's a win.  Since these patches don't expose anything 
at runtime (no knobs, switches, etc.) we leave ourselves plenty of flexibility 
for changing things in the future.
Eric Blake June 15, 2012, 9:44 p.m. UTC | #10
On 06/15/2012 03:23 PM, Blue Swirl wrote:

> How about seccomp mode selected by command line switch -seccomp, in
> which bind/connect/open/execve are forbidden? The functionality
> remaining would be somewhat limited (can't migrate or use SMB etc.

More properly, can't migrate with exec:command migration.  But fd:nnn
migration should still be viable.
Blue Swirl June 16, 2012, 6:46 a.m. UTC | #11
On Fri, Jun 15, 2012 at 9:36 PM, Paul Moore <pmoore@redhat.com> wrote:
> On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote:
>> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote:
>> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>> >> I think allowing execve() would render seccomp pretty much useless.
>> >
>> > Not necessarily.
>> >
>> > I'll agree that it does seem a bit odd to allow execve(), but there is
>> > still value in enabling seccomp to disable potentially buggy/exploitable
>> > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
>> > including the 32 bit versions, and even if we add all of the new syscalls
>> > suggested in this thread we are still talking about a small subset of
>> > syscalls.  As far as security goes, the old adage of "less is more"
>> > applies.
>>
>> The helper program being executed could need any of the 300 system
>> calls, so we'd have to allow all.
>
> Don't we have some basic understanding of what the applications being exec'd
> will need to do?  I sorta see your point, but allowing the entire set of
> syscalls seems a bit dramatic.

At least qemu-ifup/down scripts, migration exec and smbd have been
mentioned. Only the system calls made by smbd (for some version of it)
can be known. The user could specify arbitrary commands for the
others, those could be assumed to use some common (large) subset of
system calls but I think the security value would be close to zero
then.

>
>> > Protecting against the abuse and misuse of execve() is something that is
>> > better done with the host's access controls (traditional DAC, MAC via the
>> > LSM, etc.).
>>
>> How about seccomp mode selected by command line switch -seccomp, in
>> which bind/connect/open/execve are forbidden? The functionality
>> remaining would be somewhat limited (can't migrate or use SMB etc.
>> until refactoring of QEMU), but that way seccomp jail would be much
>> tighter.
>
> When I spoke to Anthony about this earlier (offline, sorry) he was opposed to
> requiring any switches or user interaction to enable seccomp.  I'm not sure if
> his stance on this has changed any over the past few months.

There could be two modes, strict mode (-seccomp) and default mode
(only some syscalls blocked). With the future decomposed QEMU, strict
seccomp mode would be default and the switch would be obsoleted. If
the decomposition is planned to happen soonish, adding the switch
would be just churn.

>
> In my perfect world, we would have a decomposed QEMU that functions as a
> series of processes connected via some sort of IPC; the exact divisions are a
> bit TBD and beyond the scope of this discussion.  In this scenario we would be
> able to restrict QEMU with sVirt and seccomp to a much higher degree than we
> could with the current monolithic QEMU.
>
> I don't expect to see my perfect world any time soon, but in the meantime we
> can still improve the security of QEMU on Linux with these seccomp patches and
> for that reason I think it's a win.  Since these patches don't expose anything
> at runtime (no knobs, switches, etc.) we leave ourselves plenty of flexibility
> for changing things in the future.

Yes, I'm much in favor of adding seccomp support soon. But I just
wonder if this is really the best level of security we can reach now,
not assuming decomposed QEMU, but just minor tweaks?

>
> --
> paul moore
> security and virtualization @ redhat
>
Daniel P. Berrangé June 18, 2012, 8:26 a.m. UTC | #12
On Fri, Jun 15, 2012 at 07:06:10PM +0000, Blue Swirl wrote:
> On Wed, Jun 13, 2012 at 8:30 PM, Daniel P. Berrange <berrange@redhat.com> wrote:
> > On Wed, Jun 13, 2012 at 04:20:22PM -0300, Eduardo Otubo wrote:
> >> I added a syscall struct using priority levels as described in the
> >> libseccomp man page. The priority numbers are based to the frequency
> >> they appear in a sample strace from a regular qemu guest run under
> >> libvirt.
> >>
> >> Libseccomp generates linear BPF code to filter system calls, those rules
> >> are read one after another. The priority system places the most common
> >> rules first in order to reduce the overhead when processing them.
> >>
> >> Also, since this is just a first RFC, the whitelist is a little raw. We
> >> might need your help to improve, test and fine tune the set of system
> >> calls.
> >>
> >> v2: Fixed some style issues
> >>       Removed code from vl.c and created qemu-seccomp.[ch]
> >>       Now using ARRAY_SIZE macro
> >>       Added more syscalls without priority/frequency set yet
> >>
> >> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> >> ---
> >>  qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  qemu-seccomp.h |    9 +++++++
> >>  vl.c           |    7 ++++++
> >>  3 files changed, 89 insertions(+)
> >>  create mode 100644 qemu-seccomp.c
> >>  create mode 100644 qemu-seccomp.h
> >>
> >> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
> >> new file mode 100644
> >> index 0000000..048b7ba
> >> --- /dev/null
> >> +++ b/qemu-seccomp.c
> >> @@ -0,0 +1,73 @@
> >> +#include <stdio.h>
> >> +#include <seccomp.h>
> >> +#include "qemu-seccomp.h"
> >> +
> >> +static struct QemuSeccompSyscall seccomp_whitelist[] = {
> >> +    { SCMP_SYS(timer_settime), 255 },
> >> +    { SCMP_SYS(timer_gettime), 254 },
> >> +    { SCMP_SYS(futex), 253 },
> >> +    { SCMP_SYS(select), 252 },
> >> +    { SCMP_SYS(recvfrom), 251 },
> >> +    { SCMP_SYS(sendto), 250 },
> >> +    { SCMP_SYS(read), 249 },
> >> +    { SCMP_SYS(brk), 248 },
> >> +    { SCMP_SYS(clone), 247 },
> >> +    { SCMP_SYS(mmap), 247 },
> >> +    { SCMP_SYS(mprotect), 246 },
> >> +    { SCMP_SYS(ioctl), 245 },
> >> +    { SCMP_SYS(recvmsg), 245 },
> >> +    { SCMP_SYS(sendmsg), 245 },
> >> +    { SCMP_SYS(accept), 245 },
> >> +    { SCMP_SYS(connect), 245 },
> >> +    { SCMP_SYS(bind), 245 },
> >> +    { SCMP_SYS(listen), 245 },
> >> +    { SCMP_SYS(ioctl), 245 },
> >> +    { SCMP_SYS(eventfd), 245 },
> >> +    { SCMP_SYS(rt_sigprocmask), 245 },
> >> +    { SCMP_SYS(write), 244 },
> >> +    { SCMP_SYS(fcntl), 243 },
> >> +    { SCMP_SYS(tgkill), 242 },
> >> +    { SCMP_SYS(rt_sigaction), 242 },
> >> +    { SCMP_SYS(pipe2), 242 },
> >> +    { SCMP_SYS(munmap), 242 },
> >> +    { SCMP_SYS(mremap), 242 },
> >> +    { SCMP_SYS(getsockname), 242 },
> >> +    { SCMP_SYS(getpeername), 242 },
> >> +    { SCMP_SYS(fdatasync), 242 },
> >> +    { SCMP_SYS(close), 242 }
> >
> > execve(), so QEMU can run things like the ifup/down
> > scripts, the samba daemon (sic), exec: migration protocol,
> > etc, etc
> 
> I think allowing execve() would render seccomp pretty much useless.

So do I, but in the previous posting it was stated[1] that the
intent is to allow all syscalls QEMU needs, and not have
any loss of current functionality. Hence I'm reporting all syscalls
that are missing that QEMU needs.


Daniel

[1]  https://lists.gnu.org/archive/html/qemu-devel/2012-05/msg00928.html
Daniel P. Berrangé June 18, 2012, 8:31 a.m. UTC | #13
On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> > I think allowing execve() would render seccomp pretty much useless.
> 
> Not necessarily.
> 
> I'll agree that it does seem a bit odd to allow execve(), but there is still 
> value in enabling seccomp to disable potentially buggy/exploitable syscalls.  
> Let's not forget that we have over 300 syscalls on x86_64, not including the 
> 32 bit versions, and even if we add all of the new syscalls suggested in this 
> thread we are still talking about a small subset of syscalls.  As far as 
> security goes, the old adage of "less is more" applies.

I can sort of see this argument, but *only* if the QEMU process is being
run under a dedicated, fully unprivileged (from a DAC pov) user, completely
separate from anything else on the system.

If QEMU were being run as root, then even with seccomp, it could trivially
just overwrite some binary in /bin, update /proc/core-pattern to point to
this binary, and then crash itself. Now that core handling binary will
execute without any of the seccomp filters applied.

Similarly if QEMU is being run in the user's desktop session, I'm sure there
is some kind of similar attack possible by changing a config setting for the
user's GNOME/KDE session, and then waiting for GNOME/KDE to execute the script
that QEMU just wrote out, once again bypassing seccomp.

Regards,
Daniel
Daniel P. Berrangé June 18, 2012, 8:33 a.m. UTC | #14
On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote:
> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com> wrote:
> > On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote:
> >> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote:
> >> > I added a syscall struct using priority levels as described in the
> >> > libseccomp man page. The priority numbers are based to the frequency
> >> > they appear in a sample strace from a regular qemu guest run under
> >> > libvirt.
> >> >
> >> > Libseccomp generates linear BPF code to filter system calls, those rules
> >> > are read one after another. The priority system places the most common
> >> > rules first in order to reduce the overhead when processing them.
> >> >
> >> > Also, since this is just a first RFC, the whitelist is a little raw. We
> >> > might need your help to improve, test and fine tune the set of system
> >> > calls.
> >> >
> >> > v2: Fixed some style issues
> >> >        Removed code from vl.c and created qemu-seccomp.[ch]
> >> >        Now using ARRAY_SIZE macro
> >> >        Added more syscalls without priority/frequency set yet
> >> >
> >> > Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> >> > ---
> >> >  qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >> >  qemu-seccomp.h |    9 +++++++
> >> >  vl.c           |    7 ++++++
> >> >  3 files changed, 89 insertions(+)
> >> >  create mode 100644 qemu-seccomp.c
> >> >  create mode 100644 qemu-seccomp.h
> >> >
> >> > diff --git a/qemu-seccomp.c b/qemu-seccomp.c
> >> > new file mode 100644
> >> > index 0000000..048b7ba
> >> > --- /dev/null
> >> > +++ b/qemu-seccomp.c
> >> > @@ -0,0 +1,73 @@
> >>
> >> Copyright and license info missing.
> >>
> >> > +#include <stdio.h>
> >> > +#include <seccomp.h>
> >> > +#include "qemu-seccomp.h"
> >> > +
> >> > +static struct QemuSeccompSyscall seccomp_whitelist[] = {
> >>
> >> 'const'
> >>
> >> > +    { SCMP_SYS(timer_settime), 255 },
> >> > +    { SCMP_SYS(timer_gettime), 254 },
> >> > +    { SCMP_SYS(futex), 253 },
> >> > +    { SCMP_SYS(select), 252 },
> >> > +    { SCMP_SYS(recvfrom), 251 },
> >> > +    { SCMP_SYS(sendto), 250 },
> >> > +    { SCMP_SYS(read), 249 },
> >> > +    { SCMP_SYS(brk), 248 },
> >> > +    { SCMP_SYS(clone), 247 },
> >> > +    { SCMP_SYS(mmap), 247 },
> >> > +    { SCMP_SYS(mprotect), 246 },
> >> > +    { SCMP_SYS(ioctl), 245 },
> >> > +    { SCMP_SYS(recvmsg), 245 },
> >> > +    { SCMP_SYS(sendmsg), 245 },
> >> > +    { SCMP_SYS(accept), 245 },
> >> > +    { SCMP_SYS(connect), 245 },
> >> > +    { SCMP_SYS(bind), 245 },
> >>
> >> It would be nice to avoid connect() and bind(). Perhaps seccomp init
> >> should be postponed to after all sockets have been created?
> >
> > If you want to migrate your guest, you need to be able to
> > call connect() at an arbitrary point in the QEMU process'
> > lifecycle. So you can't avoid allowing connect(). Similarly
> > if you want to allow hotplug of NICs (and their backends)
> > then you need to have both bind() + connect() available.
> 
> That's bad. Migration could conceivably be extended to use file
> descriptor passing, but hotplug is more tricky.

As with execve(), i'm reporting this on the basis that on the previous
patch posting I was told we must whitelist any syscalls QEMU can
conceivably use to avoid any loss in functionality.


Daniel
Daniel P. Berrangé June 18, 2012, 8:38 a.m. UTC | #15
On Mon, Jun 18, 2012 at 09:31:03AM +0100, Daniel P. Berrange wrote:
> On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> > > I think allowing execve() would render seccomp pretty much useless.
> > 
> > Not necessarily.
> > 
> > I'll agree that it does seem a bit odd to allow execve(), but there is still 
> > value in enabling seccomp to disable potentially buggy/exploitable syscalls.  
> > Let's not forget that we have over 300 syscalls on x86_64, not including the 
> > 32 bit versions, and even if we add all of the new syscalls suggested in this 
> > thread we are still talking about a small subset of syscalls.  As far as 
> > security goes, the old adage of "less is more" applies.
> 
> I can sort of see this argument, but *only* if the QEMU process is being
> run under a dedicated, fully unprivileged (from a DAC pov) user, completely
> separate from anything else on the system.

Or, of course, for a QEMU already confined by SELinux.

> If QEMU were being run as root, then even with seccomp, it could trivially
> just overwrite some binary in /bin, update /proc/core-pattern to point to
> this binary, and then crash itself. Now that core handling binary will
> execute without any of the seccomp filters applied.
> 
> Similarly if QEMU is being run in the user's desktop session, I'm sure there
> is some kind of similar attack possible by changing a config setting for the
> user's GNOME/KDE session, and then waiting for GNOME/KDE to execute the script
> that QEMU just wrote out, once again bypassing seccomp.


Daniel
Paul Moore June 18, 2012, 1:52 p.m. UTC | #16
On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote:
> On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> > > I think allowing execve() would render seccomp pretty much useless.
> > 
> > Not necessarily.
> > 
> > I'll agree that it does seem a bit odd to allow execve(), but there is
> > still value in enabling seccomp to disable potentially buggy/exploitable
> > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
> > including the 32 bit versions, and even if we add all of the new syscalls
> > suggested in this thread we are still talking about a small subset of
> > syscalls.  As far as security goes, the old adage of "less is more"
> > applies.
> 
> I can sort of see this argument, but *only* if the QEMU process is being
> run under a dedicated, fully unprivileged (from a DAC pov) user, completely
> separate from anything else on the system.
>
> Or, of course, for a QEMU already confined by SELinux.

Agreed ... and considering at least one major distribution takes this approach 
it seems like reasonable functionality to me.  Confining QEMU, either through 
DAC and/or MAC, when faced with potentially malicious guests is just good 
sense.
Daniel P. Berrangé June 18, 2012, 1:55 p.m. UTC | #17
On Mon, Jun 18, 2012 at 09:52:44AM -0400, Paul Moore wrote:
> On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote:
> > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
> > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> > > > I think allowing execve() would render seccomp pretty much useless.
> > > 
> > > Not necessarily.
> > > 
> > > I'll agree that it does seem a bit odd to allow execve(), but there is
> > > still value in enabling seccomp to disable potentially buggy/exploitable
> > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
> > > including the 32 bit versions, and even if we add all of the new syscalls
> > > suggested in this thread we are still talking about a small subset of
> > > syscalls.  As far as security goes, the old adage of "less is more"
> > > applies.
> > 
> > I can sort of see this argument, but *only* if the QEMU process is being
> > run under a dedicated, fully unprivileged (from a DAC pov) user, completely
> > separate from anything else on the system.
> >
> > Or, of course, for a QEMU already confined by SELinux.
> 
> Agreed ... and considering at least one major distribution takes this approach 
> it seems like reasonable functionality to me.  Confining QEMU, either through 
> DAC and/or MAC, when faced with potentially malicious guests is just good 
> sense.

Good, I'm not missing anything then. I'd suggest that future iterations
of these patches explicitly mention the deployment scenarios in which
this technology is able to offer increases security, and also describe
the scenarios where it will not improve things.

Regards,
Daniel
Paul Moore June 18, 2012, 2:02 p.m. UTC | #18
On Monday, June 18, 2012 02:55:35 PM Daniel P. Berrange wrote:
> On Mon, Jun 18, 2012 at 09:52:44AM -0400, Paul Moore wrote:
> > On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote:
> > > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
> > > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> > > > > I think allowing execve() would render seccomp pretty much useless.
> > > > 
> > > > Not necessarily.
> > > > 
> > > > I'll agree that it does seem a bit odd to allow execve(), but there is
> > > > still value in enabling seccomp to disable potentially
> > > > buggy/exploitable
> > > > syscalls. Let's not forget that we have over 300 syscalls on x86_64,
> > > > not
> > > > including the 32 bit versions, and even if we add all of the new
> > > > syscalls
> > > > suggested in this thread we are still talking about a small subset of
> > > > syscalls.  As far as security goes, the old adage of "less is more"
> > > > applies.
> > > 
> > > I can sort of see this argument, but *only* if the QEMU process is being
> > > run under a dedicated, fully unprivileged (from a DAC pov) user,
> > > completely
> > > separate from anything else on the system.
> > > 
> > > Or, of course, for a QEMU already confined by SELinux.
> > 
> > Agreed ... and considering at least one major distribution takes this
> > approach it seems like reasonable functionality to me.  Confining QEMU,
> > either through DAC and/or MAC, when faced with potentially malicious
> > guests is just good sense.
> 
> Good, I'm not missing anything then. I'd suggest that future iterations
> of these patches explicitly mention the deployment scenarios in which
> this technology is able to offer increases security, and also describe
> the scenarios where it will not improve things.

Sounds like a reasonable request to me.
Corey Bryant June 18, 2012, 3:22 p.m. UTC | #19
On 06/18/2012 04:33 AM, Daniel P. Berrange wrote:
> On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote:
>> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com> wrote:
>>> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote:
>>>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote:
>>>>> I added a syscall struct using priority levels as described in the
>>>>> libseccomp man page. The priority numbers are based to the frequency
>>>>> they appear in a sample strace from a regular qemu guest run under
>>>>> libvirt.
>>>>>
>>>>> Libseccomp generates linear BPF code to filter system calls, those rules
>>>>> are read one after another. The priority system places the most common
>>>>> rules first in order to reduce the overhead when processing them.
>>>>>
>>>>> Also, since this is just a first RFC, the whitelist is a little raw. We
>>>>> might need your help to improve, test and fine tune the set of system
>>>>> calls.
>>>>>
>>>>> v2: Fixed some style issues
>>>>>         Removed code from vl.c and created qemu-seccomp.[ch]
>>>>>         Now using ARRAY_SIZE macro
>>>>>         Added more syscalls without priority/frequency set yet
>>>>>
>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>>>>> ---
>>>>>   qemu-seccomp.c |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>   qemu-seccomp.h |    9 +++++++
>>>>>   vl.c           |    7 ++++++
>>>>>   3 files changed, 89 insertions(+)
>>>>>   create mode 100644 qemu-seccomp.c
>>>>>   create mode 100644 qemu-seccomp.h
>>>>>
>>>>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
>>>>> new file mode 100644
>>>>> index 0000000..048b7ba
>>>>> --- /dev/null
>>>>> +++ b/qemu-seccomp.c
>>>>> @@ -0,0 +1,73 @@
>>>>
>>>> Copyright and license info missing.
>>>>
>>>>> +#include <stdio.h>
>>>>> +#include <seccomp.h>
>>>>> +#include "qemu-seccomp.h"
>>>>> +
>>>>> +static struct QemuSeccompSyscall seccomp_whitelist[] = {
>>>>
>>>> 'const'
>>>>
>>>>> +    { SCMP_SYS(timer_settime), 255 },
>>>>> +    { SCMP_SYS(timer_gettime), 254 },
>>>>> +    { SCMP_SYS(futex), 253 },
>>>>> +    { SCMP_SYS(select), 252 },
>>>>> +    { SCMP_SYS(recvfrom), 251 },
>>>>> +    { SCMP_SYS(sendto), 250 },
>>>>> +    { SCMP_SYS(read), 249 },
>>>>> +    { SCMP_SYS(brk), 248 },
>>>>> +    { SCMP_SYS(clone), 247 },
>>>>> +    { SCMP_SYS(mmap), 247 },
>>>>> +    { SCMP_SYS(mprotect), 246 },
>>>>> +    { SCMP_SYS(ioctl), 245 },
>>>>> +    { SCMP_SYS(recvmsg), 245 },
>>>>> +    { SCMP_SYS(sendmsg), 245 },
>>>>> +    { SCMP_SYS(accept), 245 },
>>>>> +    { SCMP_SYS(connect), 245 },
>>>>> +    { SCMP_SYS(bind), 245 },
>>>>
>>>> It would be nice to avoid connect() and bind(). Perhaps seccomp init
>>>> should be postponed to after all sockets have been created?
>>>
>>> If you want to migrate your guest, you need to be able to
>>> call connect() at an arbitrary point in the QEMU process'
>>> lifecycle. So you can't avoid allowing connect(). Similarly
>>> if you want to allow hotplug of NICs (and their backends)
>>> then you need to have both bind() + connect() available.
>>
>> That's bad. Migration could conceivably be extended to use file
>> descriptor passing, but hotplug is more tricky.
>
> As with execve(), i'm reporting this on the basis that on the previous
> patch posting I was told we must whitelist any syscalls QEMU can
> conceivably use to avoid any loss in functionality.

Thanks for pointing out syscalls needed for the whitelist.

As Paul has already mentioned, it was recommended that we restrict all 
of QEMU (as a single process) from the start of execution.  This is 
opposed to other options of restricting QEMU from the time that vCPUS 
start, further restricting based on syscall parms, or decomposing QEMU 
into multiple processes that are individually restricted with their own 
seccomp whitelists.

I think this approach is a good starting point that can be further tuned 
in the future.  And as with most security measures, defense in depth 
improves the cause (e.g. combining seccomp with DAC or MAC).
Corey Bryant June 18, 2012, 3:29 p.m. UTC | #20
On 06/18/2012 04:31 AM, Daniel P. Berrange wrote:
> On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
>> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>>> I think allowing execve() would render seccomp pretty much useless.
>>
>> Not necessarily.
>>
>> I'll agree that it does seem a bit odd to allow execve(), but there is still
>> value in enabling seccomp to disable potentially buggy/exploitable syscalls.
>> Let's not forget that we have over 300 syscalls on x86_64, not including the
>> 32 bit versions, and even if we add all of the new syscalls suggested in this
>> thread we are still talking about a small subset of syscalls.  As far as
>> security goes, the old adage of "less is more" applies.
>
> I can sort of see this argument, but *only* if the QEMU process is being
> run under a dedicated, fully unprivileged (from a DAC pov) user, completely
> separate from anything else on the system.

This might be a good point to plug Marcelo Cerri's DAC isolation patches 
that are on the libvirt mailing list. :)

http://www.redhat.com/archives/libvir-list/2012-May/msg01005.html
http://www.redhat.com/archives/libvir-list/2012-June/msg00020.html
Corey Bryant June 18, 2012, 5:41 p.m. UTC | #21
On 06/16/2012 02:46 AM, Blue Swirl wrote:
> On Fri, Jun 15, 2012 at 9:36 PM, Paul Moore <pmoore@redhat.com> wrote:
>> On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote:
>>> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote:
>>>> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>>>>> I think allowing execve() would render seccomp pretty much useless.
>>>>
>>>> Not necessarily.
>>>>
>>>> I'll agree that it does seem a bit odd to allow execve(), but there is
>>>> still value in enabling seccomp to disable potentially buggy/exploitable
>>>> syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
>>>> including the 32 bit versions, and even if we add all of the new syscalls
>>>> suggested in this thread we are still talking about a small subset of
>>>> syscalls.  As far as security goes, the old adage of "less is more"
>>>> applies.
>>>
>>> The helper program being executed could need any of the 300 system
>>> calls, so we'd have to allow all.
>>
>> Don't we have some basic understanding of what the applications being exec'd
>> will need to do?  I sorta see your point, but allowing the entire set of
>> syscalls seems a bit dramatic.
>
> At least qemu-ifup/down scripts, migration exec and smbd have been
> mentioned. Only the system calls made by smbd (for some version of it)
> can be known. The user could specify arbitrary commands for the
> others, those could be assumed to use some common (large) subset of
> system calls but I think the security value would be close to zero
> then.
>

I think at some point in the future we'll want to implement seccomp 
whitelists for each process that QEMU exec's.  The bridge helper (-net 
bridge) is one that I'm familiar with.  It is fairly minimal in the 
syscalls that it requires.  It certainly doesn't need all the syscalls 
that QEMU needs.  It is already severely restricted by SELinux (and 
AppArmor soon), but restricting the syscall footprint would be a nice 
addition.
Eduardo Otubo June 18, 2012, 8:13 p.m. UTC | #22
On Mon, Jun 18, 2012 at 02:55:35PM +0100, Daniel P. Berrange wrote:
> On Mon, Jun 18, 2012 at 09:52:44AM -0400, Paul Moore wrote:
> > On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote:
> > > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
> > > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> > > > > I think allowing execve() would render seccomp pretty much useless.
> > > > 
> > > > Not necessarily.
> > > > 
> > > > I'll agree that it does seem a bit odd to allow execve(), but there is
> > > > still value in enabling seccomp to disable potentially buggy/exploitable
> > > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
> > > > including the 32 bit versions, and even if we add all of the new syscalls
> > > > suggested in this thread we are still talking about a small subset of
> > > > syscalls.  As far as security goes, the old adage of "less is more"
> > > > applies.
> > > 
> > > I can sort of see this argument, but *only* if the QEMU process is being
> > > run under a dedicated, fully unprivileged (from a DAC pov) user, completely
> > > separate from anything else on the system.
> > >
> > > Or, of course, for a QEMU already confined by SELinux.
> > 
> > Agreed ... and considering at least one major distribution takes this approach 
> > it seems like reasonable functionality to me.  Confining QEMU, either through 
> > DAC and/or MAC, when faced with potentially malicious guests is just good 
> > sense.
> 
> Good, I'm not missing anything then. I'd suggest that future iterations
> of these patches explicitly mention the deployment scenarios in which
> this technology is able to offer increases security, and also describe
> the scenarios where it will not improve things.

Please correct me if I'm wrong here, but I don't understand how exactly
whitelisting execve() is odd. The white list is inherit and passed along
the child processes so they also need to have their own syscalls filtered
by BPF in the kernel as stated in the Will's commit log[1] - "Filter
programs will be inherited across fork/clone and execve." - I wonder if
this is main point of your concern. Whitelisting execve() or not should be
no difference from the security pov.

However, I agree that a possible future feature could a customized
whitelist for each child process spawned. But for a first instance, the
default whitelist should be enough to start seccomp support in Qemu.

Also, as far as I understand, seccomp never meant to replace any of the
technologies above mentioned. Using more than one layer of protection
(SELinux, AppArmor MAC policy and/or DAC) should always be a good practice
for the defense in depth.

[1] -
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=e2cfabdfd075648216f99c2c03821cf3f47c1727
Blue Swirl June 18, 2012, 8:15 p.m. UTC | #23
On Mon, Jun 18, 2012 at 8:31 AM, Daniel P. Berrange <berrange@redhat.com> wrote:
> On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
>> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>> > I think allowing execve() would render seccomp pretty much useless.
>>
>> Not necessarily.
>>
>> I'll agree that it does seem a bit odd to allow execve(), but there is still
>> value in enabling seccomp to disable potentially buggy/exploitable syscalls.
>> Let's not forget that we have over 300 syscalls on x86_64, not including the
>> 32 bit versions, and even if we add all of the new syscalls suggested in this
>> thread we are still talking about a small subset of syscalls.  As far as
>> security goes, the old adage of "less is more" applies.
>
> I can sort of see this argument, but *only* if the QEMU process is being
> run under a dedicated, fully unprivileged (from a DAC pov) user, completely
> separate from anything else on the system.
>
> If QEMU were being run as root, then even with seccomp, it could trivially
> just overwrite some binary in /bin, update /proc/core-pattern to point to

Not wiithout 'open'. When run as root, it would be nice to chroot()
also to some empty directory and then drop chroot() privileges.

> this binary, and then crash itself. Now that core handling binary will
> execute without any of the seccomp filters applied.
>
> Similarly if QEMU is being run in the user's desktop session, I'm sure there
> is some kind of similar attack possible by changing a config setting for the
> user's GNOME/KDE session, and then waiting for GNOME/KDE to execute the script
> that QEMU just wrote out, once again bypassing seccomp.
>
> Regards,
> Daniel
> --
> |: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org              -o-             http://virt-manager.org :|
> |: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
> |: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|
Blue Swirl June 18, 2012, 8:18 p.m. UTC | #24
On Mon, Jun 18, 2012 at 3:22 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote:
>
>
> On 06/18/2012 04:33 AM, Daniel P. Berrange wrote:
>>
>> On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote:
>>>
>>> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com>
>>> wrote:
>>>>
>>>> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote:
>>>>>
>>>>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo
>>>>> <otubo@linux.vnet.ibm.com> wrote:
>>>>>>
>>>>>> I added a syscall struct using priority levels as described in the
>>>>>> libseccomp man page. The priority numbers are based to the frequency
>>>>>> they appear in a sample strace from a regular qemu guest run under
>>>>>> libvirt.
>>>>>>
>>>>>> Libseccomp generates linear BPF code to filter system calls, those
>>>>>> rules
>>>>>> are read one after another. The priority system places the most common
>>>>>> rules first in order to reduce the overhead when processing them.
>>>>>>
>>>>>> Also, since this is just a first RFC, the whitelist is a little raw.
>>>>>> We
>>>>>> might need your help to improve, test and fine tune the set of system
>>>>>> calls.
>>>>>>
>>>>>> v2: Fixed some style issues
>>>>>>        Removed code from vl.c and created qemu-seccomp.[ch]
>>>>>>        Now using ARRAY_SIZE macro
>>>>>>        Added more syscalls without priority/frequency set yet
>>>>>>
>>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>>>>>> ---
>>>>>>  qemu-seccomp.c |   73
>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>  qemu-seccomp.h |    9 +++++++
>>>>>>  vl.c           |    7 ++++++
>>>>>>  3 files changed, 89 insertions(+)
>>>>>>  create mode 100644 qemu-seccomp.c
>>>>>>  create mode 100644 qemu-seccomp.h
>>>>>>
>>>>>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
>>>>>> new file mode 100644
>>>>>> index 0000000..048b7ba
>>>>>> --- /dev/null
>>>>>> +++ b/qemu-seccomp.c
>>>>>> @@ -0,0 +1,73 @@
>>>>>
>>>>>
>>>>> Copyright and license info missing.
>>>>>
>>>>>> +#include <stdio.h>
>>>>>> +#include <seccomp.h>
>>>>>> +#include "qemu-seccomp.h"
>>>>>> +
>>>>>> +static struct QemuSeccompSyscall seccomp_whitelist[] = {
>>>>>
>>>>>
>>>>> 'const'
>>>>>
>>>>>> +    { SCMP_SYS(timer_settime), 255 },
>>>>>> +    { SCMP_SYS(timer_gettime), 254 },
>>>>>> +    { SCMP_SYS(futex), 253 },
>>>>>> +    { SCMP_SYS(select), 252 },
>>>>>> +    { SCMP_SYS(recvfrom), 251 },
>>>>>> +    { SCMP_SYS(sendto), 250 },
>>>>>> +    { SCMP_SYS(read), 249 },
>>>>>> +    { SCMP_SYS(brk), 248 },
>>>>>> +    { SCMP_SYS(clone), 247 },
>>>>>> +    { SCMP_SYS(mmap), 247 },
>>>>>> +    { SCMP_SYS(mprotect), 246 },
>>>>>> +    { SCMP_SYS(ioctl), 245 },
>>>>>> +    { SCMP_SYS(recvmsg), 245 },
>>>>>> +    { SCMP_SYS(sendmsg), 245 },
>>>>>> +    { SCMP_SYS(accept), 245 },
>>>>>> +    { SCMP_SYS(connect), 245 },
>>>>>> +    { SCMP_SYS(bind), 245 },
>>>>>
>>>>>
>>>>> It would be nice to avoid connect() and bind(). Perhaps seccomp init
>>>>> should be postponed to after all sockets have been created?
>>>>
>>>>
>>>> If you want to migrate your guest, you need to be able to
>>>> call connect() at an arbitrary point in the QEMU process'
>>>> lifecycle. So you can't avoid allowing connect(). Similarly
>>>> if you want to allow hotplug of NICs (and their backends)
>>>> then you need to have both bind() + connect() available.
>>>
>>>
>>> That's bad. Migration could conceivably be extended to use file
>>> descriptor passing, but hotplug is more tricky.
>>
>>
>> As with execve(), i'm reporting this on the basis that on the previous
>> patch posting I was told we must whitelist any syscalls QEMU can
>> conceivably use to avoid any loss in functionality.
>
>
> Thanks for pointing out syscalls needed for the whitelist.
>
> As Paul has already mentioned, it was recommended that we restrict all of
> QEMU (as a single process) from the start of execution.  This is opposed to
> other options of restricting QEMU from the time that vCPUS start, further
> restricting based on syscall parms, or decomposing QEMU into multiple
> processes that are individually restricted with their own seccomp
> whitelists.

Can each thread have separate seccomp whitelists? For example CPU
threads should not need pretty much anything but the I/O thread needs
I/O.

> I think this approach is a good starting point that can be further tuned in
> the future.  And as with most security measures, defense in depth improves
> the cause (e.g. combining seccomp with DAC or MAC).

Agreed.

>
> --
> Regards,
> Corey
>
>
Blue Swirl June 18, 2012, 8:23 p.m. UTC | #25
On Mon, Jun 18, 2012 at 8:13 PM, Eduardo Otubo <otubo@linux.vnet.ibm.com> wrote:
> On Mon, Jun 18, 2012 at 02:55:35PM +0100, Daniel P. Berrange wrote:
>> On Mon, Jun 18, 2012 at 09:52:44AM -0400, Paul Moore wrote:
>> > On Monday, June 18, 2012 09:31:03 AM Daniel P. Berrange wrote:
>> > > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
>> > > > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>> > > > > I think allowing execve() would render seccomp pretty much useless.
>> > > >
>> > > > Not necessarily.
>> > > >
>> > > > I'll agree that it does seem a bit odd to allow execve(), but there is
>> > > > still value in enabling seccomp to disable potentially buggy/exploitable
>> > > > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
>> > > > including the 32 bit versions, and even if we add all of the new syscalls
>> > > > suggested in this thread we are still talking about a small subset of
>> > > > syscalls.  As far as security goes, the old adage of "less is more"
>> > > > applies.
>> > >
>> > > I can sort of see this argument, but *only* if the QEMU process is being
>> > > run under a dedicated, fully unprivileged (from a DAC pov) user, completely
>> > > separate from anything else on the system.
>> > >
>> > > Or, of course, for a QEMU already confined by SELinux.
>> >
>> > Agreed ... and considering at least one major distribution takes this approach
>> > it seems like reasonable functionality to me.  Confining QEMU, either through
>> > DAC and/or MAC, when faced with potentially malicious guests is just good
>> > sense.
>>
>> Good, I'm not missing anything then. I'd suggest that future iterations
>> of these patches explicitly mention the deployment scenarios in which
>> this technology is able to offer increases security, and also describe
>> the scenarios where it will not improve things.
>
> Please correct me if I'm wrong here, but I don't understand how exactly
> whitelisting execve() is odd. The white list is inherit and passed along
> the child processes so they also need to have their own syscalls filtered
> by BPF in the kernel as stated in the Will's commit log[1] - "Filter
> programs will be inherited across fork/clone and execve." - I wonder if
> this is main point of your concern. Whitelisting execve() or not should be
> no difference from the security pov.

The helper might behave strangely (read: according to attacker's
wishes) when some of the required syscalls are not working.

> However, I agree that a possible future feature could a customized
> whitelist for each child process spawned. But for a first instance, the
> default whitelist should be enough to start seccomp support in Qemu.

It's a good way. I just wish the list weren't so open and lame to begin with.

>
> Also, as far as I understand, seccomp never meant to replace any of the
> technologies above mentioned. Using more than one layer of protection
> (SELinux, AppArmor MAC policy and/or DAC) should always be a good practice
> for the defense in depth.
>
> [1] -
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commit;h=e2cfabdfd075648216f99c2c03821cf3f47c1727
>
> --
> Eduardo Otubo
> Software Engineer
> Linux Technology Center
> IBM Systems & Technology Group
> Mobile: +55 19 8135 0885
> eotubo@linux.vnet.ibm.com
>
Corey Bryant June 18, 2012, 9:53 p.m. UTC | #26
On 06/18/2012 04:18 PM, Blue Swirl wrote:
> On Mon, Jun 18, 2012 at 3:22 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote:
>>
>>
>> On 06/18/2012 04:33 AM, Daniel P. Berrange wrote:
>>>
>>> On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote:
>>>>
>>>> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange <berrange@redhat.com>
>>>> wrote:
>>>>>
>>>>> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote:
>>>>>>
>>>>>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo
>>>>>> <otubo@linux.vnet.ibm.com> wrote:
>>>>>>>
>>>>>>> I added a syscall struct using priority levels as described in the
>>>>>>> libseccomp man page. The priority numbers are based to the frequency
>>>>>>> they appear in a sample strace from a regular qemu guest run under
>>>>>>> libvirt.
>>>>>>>
>>>>>>> Libseccomp generates linear BPF code to filter system calls, those
>>>>>>> rules
>>>>>>> are read one after another. The priority system places the most common
>>>>>>> rules first in order to reduce the overhead when processing them.
>>>>>>>
>>>>>>> Also, since this is just a first RFC, the whitelist is a little raw.
>>>>>>> We
>>>>>>> might need your help to improve, test and fine tune the set of system
>>>>>>> calls.
>>>>>>>
>>>>>>> v2: Fixed some style issues
>>>>>>>         Removed code from vl.c and created qemu-seccomp.[ch]
>>>>>>>         Now using ARRAY_SIZE macro
>>>>>>>         Added more syscalls without priority/frequency set yet
>>>>>>>
>>>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>>>>>>> ---
>>>>>>>   qemu-seccomp.c |   73
>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>   qemu-seccomp.h |    9 +++++++
>>>>>>>   vl.c           |    7 ++++++
>>>>>>>   3 files changed, 89 insertions(+)
>>>>>>>   create mode 100644 qemu-seccomp.c
>>>>>>>   create mode 100644 qemu-seccomp.h
>>>>>>>
>>>>>>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
>>>>>>> new file mode 100644
>>>>>>> index 0000000..048b7ba
>>>>>>> --- /dev/null
>>>>>>> +++ b/qemu-seccomp.c
>>>>>>> @@ -0,0 +1,73 @@
>>>>>>
>>>>>>
>>>>>> Copyright and license info missing.
>>>>>>
>>>>>>> +#include <stdio.h>
>>>>>>> +#include <seccomp.h>
>>>>>>> +#include "qemu-seccomp.h"
>>>>>>> +
>>>>>>> +static struct QemuSeccompSyscall seccomp_whitelist[] = {
>>>>>>
>>>>>>
>>>>>> 'const'
>>>>>>
>>>>>>> +    { SCMP_SYS(timer_settime), 255 },
>>>>>>> +    { SCMP_SYS(timer_gettime), 254 },
>>>>>>> +    { SCMP_SYS(futex), 253 },
>>>>>>> +    { SCMP_SYS(select), 252 },
>>>>>>> +    { SCMP_SYS(recvfrom), 251 },
>>>>>>> +    { SCMP_SYS(sendto), 250 },
>>>>>>> +    { SCMP_SYS(read), 249 },
>>>>>>> +    { SCMP_SYS(brk), 248 },
>>>>>>> +    { SCMP_SYS(clone), 247 },
>>>>>>> +    { SCMP_SYS(mmap), 247 },
>>>>>>> +    { SCMP_SYS(mprotect), 246 },
>>>>>>> +    { SCMP_SYS(ioctl), 245 },
>>>>>>> +    { SCMP_SYS(recvmsg), 245 },
>>>>>>> +    { SCMP_SYS(sendmsg), 245 },
>>>>>>> +    { SCMP_SYS(accept), 245 },
>>>>>>> +    { SCMP_SYS(connect), 245 },
>>>>>>> +    { SCMP_SYS(bind), 245 },
>>>>>>
>>>>>>
>>>>>> It would be nice to avoid connect() and bind(). Perhaps seccomp init
>>>>>> should be postponed to after all sockets have been created?
>>>>>
>>>>>
>>>>> If you want to migrate your guest, you need to be able to
>>>>> call connect() at an arbitrary point in the QEMU process'
>>>>> lifecycle. So you can't avoid allowing connect(). Similarly
>>>>> if you want to allow hotplug of NICs (and their backends)
>>>>> then you need to have both bind() + connect() available.
>>>>
>>>>
>>>> That's bad. Migration could conceivably be extended to use file
>>>> descriptor passing, but hotplug is more tricky.
>>>
>>>
>>> As with execve(), i'm reporting this on the basis that on the previous
>>> patch posting I was told we must whitelist any syscalls QEMU can
>>> conceivably use to avoid any loss in functionality.
>>
>>
>> Thanks for pointing out syscalls needed for the whitelist.
>>
>> As Paul has already mentioned, it was recommended that we restrict all of
>> QEMU (as a single process) from the start of execution.  This is opposed to
>> other options of restricting QEMU from the time that vCPUS start, further
>> restricting based on syscall parms, or decomposing QEMU into multiple
>> processes that are individually restricted with their own seccomp
>> whitelists.
>
> Can each thread have separate seccomp whitelists? For example CPU
> threads should not need pretty much anything but the I/O thread needs
> I/O.
>

No, seccomp filters are defined and enforced at the process level.
Daniel P. Berrangé June 19, 2012, 9:23 a.m. UTC | #27
On Mon, Jun 18, 2012 at 08:15:37PM +0000, Blue Swirl wrote:
> On Mon, Jun 18, 2012 at 8:31 AM, Daniel P. Berrange <berrange@redhat.com> wrote:
> > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
> >> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
> >> > I think allowing execve() would render seccomp pretty much useless.
> >>
> >> Not necessarily.
> >>
> >> I'll agree that it does seem a bit odd to allow execve(), but there is still
> >> value in enabling seccomp to disable potentially buggy/exploitable syscalls.
> >> Let's not forget that we have over 300 syscalls on x86_64, not including the
> >> 32 bit versions, and even if we add all of the new syscalls suggested in this
> >> thread we are still talking about a small subset of syscalls.  As far as
> >> security goes, the old adage of "less is more" applies.
> >
> > I can sort of see this argument, but *only* if the QEMU process is being
> > run under a dedicated, fully unprivileged (from a DAC pov) user, completely
> > separate from anything else on the system.
> >
> > If QEMU were being run as root, then even with seccomp, it could trivially
> > just overwrite some binary in /bin, update /proc/core-pattern to point to
> 
> Not wiithout 'open'. When run as root, it would be nice to chroot()
> also to some empty directory and then drop chroot() privileges.

That's just another example of my point, that adding seccomp alone
does nothing for QEMU security. It is only valuable when combined
with another security technique, be it per-user DAC separation,
SELinux MAC, or chroot, or splitting QEMU into multiple separate
processes, or using Linux containers to confine it, etc


Daniel
Avi Kivity June 19, 2012, 11:04 a.m. UTC | #28
On 06/16/2012 09:46 AM, Blue Swirl wrote:
> On Fri, Jun 15, 2012 at 9:36 PM, Paul Moore <pmoore@redhat.com> wrote:
>> On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote:
>>> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote:
>>> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>>> >> I think allowing execve() would render seccomp pretty much useless.
>>> >
>>> > Not necessarily.
>>> >
>>> > I'll agree that it does seem a bit odd to allow execve(), but there is
>>> > still value in enabling seccomp to disable potentially buggy/exploitable
>>> > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
>>> > including the 32 bit versions, and even if we add all of the new syscalls
>>> > suggested in this thread we are still talking about a small subset of
>>> > syscalls.  As far as security goes, the old adage of "less is more"
>>> > applies.
>>>
>>> The helper program being executed could need any of the 300 system
>>> calls, so we'd have to allow all.
>>
>> Don't we have some basic understanding of what the applications being exec'd
>> will need to do?  I sorta see your point, but allowing the entire set of
>> syscalls seems a bit dramatic.
> 
> At least qemu-ifup/down scripts, migration exec and smbd have been
> mentioned. Only the system calls made by smbd (for some version of it)
> can be known. The user could specify arbitrary commands for the
> others, those could be assumed to use some common (large) subset of
> system calls but I think the security value would be close to zero
> then.

We're not trying to protect against the user, but against the guest.  If
we assume the user wrote those scripts with care so they cannot be
exploited by the guest, then we are okay.

However I agree with you that it would be better to restrict those
syscalls.  The scripts are already unnecessary if using a management
system and migration supports passed file descriptors, so that leaves
only smbd, which can probably be pre-execed.

> 
>>
>>> > Protecting against the abuse and misuse of execve() is something that is
>>> > better done with the host's access controls (traditional DAC, MAC via the
>>> > LSM, etc.).
>>>
>>> How about seccomp mode selected by command line switch -seccomp, in
>>> which bind/connect/open/execve are forbidden? The functionality
>>> remaining would be somewhat limited (can't migrate or use SMB etc.
>>> until refactoring of QEMU), but that way seccomp jail would be much
>>> tighter.
>>
>> When I spoke to Anthony about this earlier (offline, sorry) he was opposed to
>> requiring any switches or user interaction to enable seccomp.  I'm not sure if
>> his stance on this has changed any over the past few months.
> 
> There could be two modes, strict mode (-seccomp) and default mode
> (only some syscalls blocked). With the future decomposed QEMU, strict
> seccomp mode would be default and the switch would be obsoleted. If
> the decomposition is planned to happen soonish, adding the switch
> would be just churn.

We have decomposed qemu to some extent, in that privileged operations
happen in libvirt.  So the modes make sense - qemu has no idea whether a
privileged management system is controlling it or not.

> 
>>
>> In my perfect world, we would have a decomposed QEMU that functions as a
>> series of processes connected via some sort of IPC; the exact divisions are a
>> bit TBD and beyond the scope of this discussion.  In this scenario we would be
>> able to restrict QEMU with sVirt and seccomp to a much higher degree than we
>> could with the current monolithic QEMU.
>>
>> I don't expect to see my perfect world any time soon, but in the meantime we
>> can still improve the security of QEMU on Linux with these seccomp patches and
>> for that reason I think it's a win.  Since these patches don't expose anything
>> at runtime (no knobs, switches, etc.) we leave ourselves plenty of flexibility
>> for changing things in the future.
> 
> Yes, I'm much in favor of adding seccomp support soon. But I just
> wonder if this is really the best level of security we can reach now,
> not assuming decomposed QEMU, but just minor tweaks?

We might disable mprotect(PROT_EXEC) if running with kvm.
Corey Bryant June 19, 2012, 4:51 p.m. UTC | #29
On 06/19/2012 11:37 AM, Will Drewry wrote:
> On Tue, Jun 19, 2012 at 8:35 AM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote:
>>
>>
>> On 06/18/2012 06:14 PM, Will Drewry wrote:
>>>
>>> [-all]
>>>
>>> On Mon, Jun 18, 2012 at 4:53 PM, Corey Bryant <coreyb@linux.vnet.ibm.com>
>>> wrote:
>>>>
>>>>
>>>>
>>>> On 06/18/2012 04:18 PM, Blue Swirl wrote:
>>>>>
>>>>>
>>>>> On Mon, Jun 18, 2012 at 3:22 PM, Corey Bryant
>>>>> <coreyb@linux.vnet.ibm.com>
>>>>> wrote:
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 06/18/2012 04:33 AM, Daniel P. Berrange wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On Fri, Jun 15, 2012 at 07:04:45PM +0000, Blue Swirl wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> On Wed, Jun 13, 2012 at 8:33 PM, Daniel P. Berrange
>>>>>>>> <berrange@redhat.com>
>>>>>>>> wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On Wed, Jun 13, 2012 at 07:56:06PM +0000, Blue Swirl wrote:
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> On Wed, Jun 13, 2012 at 7:20 PM, Eduardo Otubo
>>>>>>>>>> <otubo@linux.vnet.ibm.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I added a syscall struct using priority levels as described in the
>>>>>>>>>>> libseccomp man page. The priority numbers are based to the
>>>>>>>>>>> frequency
>>>>>>>>>>> they appear in a sample strace from a regular qemu guest run under
>>>>>>>>>>> libvirt.
>>>>>>>>>>>
>>>>>>>>>>> Libseccomp generates linear BPF code to filter system calls, those
>>>>>>>>>>> rules
>>>>>>>>>>> are read one after another. The priority system places the most
>>>>>>>>>>> common
>>>>>>>>>>> rules first in order to reduce the overhead when processing them.
>>>>>>>>>>>
>>>>>>>>>>> Also, since this is just a first RFC, the whitelist is a little
>>>>>>>>>>> raw.
>>>>>>>>>>> We
>>>>>>>>>>> might need your help to improve, test and fine tune the set of
>>>>>>>>>>> system
>>>>>>>>>>> calls.
>>>>>>>>>>>
>>>>>>>>>>> v2: Fixed some style issues
>>>>>>>>>>>         Removed code from vl.c and created qemu-seccomp.[ch]
>>>>>>>>>>>         Now using ARRAY_SIZE macro
>>>>>>>>>>>         Added more syscalls without priority/frequency set yet
>>>>>>>>>>>
>>>>>>>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>>>>>>>>>>> ---
>>>>>>>>>>>   qemu-seccomp.c |   73
>>>>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>>>   qemu-seccomp.h |    9 +++++++
>>>>>>>>>>>   vl.c           |    7 ++++++
>>>>>>>>>>>   3 files changed, 89 insertions(+)
>>>>>>>>>>>   create mode 100644 qemu-seccomp.c
>>>>>>>>>>>   create mode 100644 qemu-seccomp.h
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
>>>>>>>>>>> new file mode 100644
>>>>>>>>>>> index 0000000..048b7ba
>>>>>>>>>>> --- /dev/null
>>>>>>>>>>> +++ b/qemu-seccomp.c
>>>>>>>>>>> @@ -0,0 +1,73 @@
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Copyright and license info missing.
>>>>>>>>>>
>>>>>>>>>>> +#include <stdio.h>
>>>>>>>>>>> +#include <seccomp.h>
>>>>>>>>>>> +#include "qemu-seccomp.h"
>>>>>>>>>>> +
>>>>>>>>>>> +static struct QemuSeccompSyscall seccomp_whitelist[] = {
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> 'const'
>>>>>>>>>>
>>>>>>>>>>> +    { SCMP_SYS(timer_settime), 255 },
>>>>>>>>>>> +    { SCMP_SYS(timer_gettime), 254 },
>>>>>>>>>>> +    { SCMP_SYS(futex), 253 },
>>>>>>>>>>> +    { SCMP_SYS(select), 252 },
>>>>>>>>>>> +    { SCMP_SYS(recvfrom), 251 },
>>>>>>>>>>> +    { SCMP_SYS(sendto), 250 },
>>>>>>>>>>> +    { SCMP_SYS(read), 249 },
>>>>>>>>>>> +    { SCMP_SYS(brk), 248 },
>>>>>>>>>>> +    { SCMP_SYS(clone), 247 },
>>>>>>>>>>> +    { SCMP_SYS(mmap), 247 },
>>>>>>>>>>> +    { SCMP_SYS(mprotect), 246 },
>>>>>>>>>>> +    { SCMP_SYS(ioctl), 245 },
>>>>>>>>>>> +    { SCMP_SYS(recvmsg), 245 },
>>>>>>>>>>> +    { SCMP_SYS(sendmsg), 245 },
>>>>>>>>>>> +    { SCMP_SYS(accept), 245 },
>>>>>>>>>>> +    { SCMP_SYS(connect), 245 },
>>>>>>>>>>> +    { SCMP_SYS(bind), 245 },
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> It would be nice to avoid connect() and bind(). Perhaps seccomp
>>>>>>>>>> init
>>>>>>>>>> should be postponed to after all sockets have been created?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> If you want to migrate your guest, you need to be able to
>>>>>>>>> call connect() at an arbitrary point in the QEMU process'
>>>>>>>>> lifecycle. So you can't avoid allowing connect(). Similarly
>>>>>>>>> if you want to allow hotplug of NICs (and their backends)
>>>>>>>>> then you need to have both bind() + connect() available.
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> That's bad. Migration could conceivably be extended to use file
>>>>>>>> descriptor passing, but hotplug is more tricky.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> As with execve(), i'm reporting this on the basis that on the previous
>>>>>>> patch posting I was told we must whitelist any syscalls QEMU can
>>>>>>> conceivably use to avoid any loss in functionality.
>>>>>>
>>>>>>
>>>>>>
>>>>>>
>>>>>> Thanks for pointing out syscalls needed for the whitelist.
>>>>>>
>>>>>> As Paul has already mentioned, it was recommended that we restrict all
>>>>>> of
>>>>>> QEMU (as a single process) from the start of execution.  This is
>>>>>> opposed
>>>>>> to
>>>>>> other options of restricting QEMU from the time that vCPUS start,
>>>>>> further
>>>>>> restricting based on syscall parms, or decomposing QEMU into multiple
>>>>>> processes that are individually restricted with their own seccomp
>>>>>> whitelists.
>>>>>
>>>>>
>>>>>
>>>>> Can each thread have separate seccomp whitelists? For example CPU
>>>>> threads should not need pretty much anything but the I/O thread needs
>>>>> I/O.
>>>>>
>>>>
>>>> No, seccomp filters are defined and enforced at the process level.
>>>
>>>
>>> I'll keep lurking :) especially since I don't know the internals of
>>> qemu well, but you can do per-thread seccomp filters since
>>> processes==threads on linux. The real risk is that threads share so
>>> much that an attack on the CPU thread may be able to parlay that into
>>> a syscall proxy on a another thread.  Probably what would make sense
>>> in that way is a loose global filter, then have each sub-thread
>>> install a functionality specific second filter.
>>>
>>> I may be way off base though, so feel free to just tell me to keep lurking
>>> :)
>>>
>>> Thanks again for all the support and for pushing hard to get this
>>> functionality in qemu!
>>
>>
>> Please keep lurking!  I appreciate the input and education.  :)
>>
>> So whether it's a thread or process, I assume it will have its own a
>> task_struct, allowing us to set a filter per thread or per process.  The
>> difference being that threads share more resources than processes.  Sort of
>> thinking out loud here to see if I'm right.
>
> Exactly!
>
>> It doesn't seem ideal vs process separation, but it's do-able.
>
> Yep -- so for something like qemu, you could install a global baseline
> policy (e.g., union of all needed syscalls) then for each thread, they
> can install a more restrictive set.  The actual security guarantees
> will be the total synthesis because of cross-thread attacks, but it
> would make exploitation pretty painful.
>
> If you want better guarantees, then process separation is needed.  One
> option is even doing brokering for complex syscalls using either
> ptrace or a sigsys handler, but that is likely too much to get into
> while establishing a baseline.
>

In response to "Can each thread have separate seccomp whitelists?" 
please take a look at the thread above from Will Drewry.  seccomp *can* 
be used per thread.  However, it's not ideal vs per process seccomp filters.
Blue Swirl June 19, 2012, 6:44 p.m. UTC | #30
On Tue, Jun 19, 2012 at 9:23 AM, Daniel P. Berrange <berrange@redhat.com> wrote:
> On Mon, Jun 18, 2012 at 08:15:37PM +0000, Blue Swirl wrote:
>> On Mon, Jun 18, 2012 at 8:31 AM, Daniel P. Berrange <berrange@redhat.com> wrote:
>> > On Fri, Jun 15, 2012 at 05:02:19PM -0400, Paul Moore wrote:
>> >> On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>> >> > I think allowing execve() would render seccomp pretty much useless.
>> >>
>> >> Not necessarily.
>> >>
>> >> I'll agree that it does seem a bit odd to allow execve(), but there is still
>> >> value in enabling seccomp to disable potentially buggy/exploitable syscalls.
>> >> Let's not forget that we have over 300 syscalls on x86_64, not including the
>> >> 32 bit versions, and even if we add all of the new syscalls suggested in this
>> >> thread we are still talking about a small subset of syscalls.  As far as
>> >> security goes, the old adage of "less is more" applies.
>> >
>> > I can sort of see this argument, but *only* if the QEMU process is being
>> > run under a dedicated, fully unprivileged (from a DAC pov) user, completely
>> > separate from anything else on the system.
>> >
>> > If QEMU were being run as root, then even with seccomp, it could trivially
>> > just overwrite some binary in /bin, update /proc/core-pattern to point to
>>
>> Not wiithout 'open'. When run as root, it would be nice to chroot()
>> also to some empty directory and then drop chroot() privileges.
>
> That's just another example of my point, that adding seccomp alone
> does nothing for QEMU security. It is only valuable when combined
> with another security technique, be it per-user DAC separation,
> SELinux MAC, or chroot, or splitting QEMU into multiple separate
> processes, or using Linux containers to confine it, etc

I think seccomp with some changes to either functionality or with
minor refactoring could do a lot more than with just 'accept all'
white list. But as a starting point, this is sort of OK.

>
>
> Daniel
> --
> |: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
> |: http://libvirt.org              -o-             http://virt-manager.org :|
> |: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
> |: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|
Blue Swirl June 19, 2012, 6:58 p.m. UTC | #31
On Tue, Jun 19, 2012 at 11:04 AM, Avi Kivity <avi@redhat.com> wrote:
> On 06/16/2012 09:46 AM, Blue Swirl wrote:
>> On Fri, Jun 15, 2012 at 9:36 PM, Paul Moore <pmoore@redhat.com> wrote:
>>> On Friday, June 15, 2012 09:23:46 PM Blue Swirl wrote:
>>>> On Fri, Jun 15, 2012 at 9:02 PM, Paul Moore <pmoore@redhat.com> wrote:
>>>> > On Friday, June 15, 2012 07:06:10 PM Blue Swirl wrote:
>>>> >> I think allowing execve() would render seccomp pretty much useless.
>>>> >
>>>> > Not necessarily.
>>>> >
>>>> > I'll agree that it does seem a bit odd to allow execve(), but there is
>>>> > still value in enabling seccomp to disable potentially buggy/exploitable
>>>> > syscalls. Let's not forget that we have over 300 syscalls on x86_64, not
>>>> > including the 32 bit versions, and even if we add all of the new syscalls
>>>> > suggested in this thread we are still talking about a small subset of
>>>> > syscalls.  As far as security goes, the old adage of "less is more"
>>>> > applies.
>>>>
>>>> The helper program being executed could need any of the 300 system
>>>> calls, so we'd have to allow all.
>>>
>>> Don't we have some basic understanding of what the applications being exec'd
>>> will need to do?  I sorta see your point, but allowing the entire set of
>>> syscalls seems a bit dramatic.
>>
>> At least qemu-ifup/down scripts, migration exec and smbd have been
>> mentioned. Only the system calls made by smbd (for some version of it)
>> can be known. The user could specify arbitrary commands for the
>> others, those could be assumed to use some common (large) subset of
>> system calls but I think the security value would be close to zero
>> then.
>
> We're not trying to protect against the user, but against the guest.  If
> we assume the user wrote those scripts with care so they cannot be
> exploited by the guest, then we are okay.

My concern was that first we could accidentally filter a system call
that changes the script or executable behavior, much like sendmail +
capabilities bug, and then a guest could trigger running this
script/executable and exploit the changed behavior.

>
> However I agree with you that it would be better to restrict those
> syscalls.  The scripts are already unnecessary if using a management
> system and migration supports passed file descriptors, so that leaves
> only smbd, which can probably be pre-execed.

File descriptor passing could also work for smbd.

>
>>
>>>
>>>> > Protecting against the abuse and misuse of execve() is something that is
>>>> > better done with the host's access controls (traditional DAC, MAC via the
>>>> > LSM, etc.).
>>>>
>>>> How about seccomp mode selected by command line switch -seccomp, in
>>>> which bind/connect/open/execve are forbidden? The functionality
>>>> remaining would be somewhat limited (can't migrate or use SMB etc.
>>>> until refactoring of QEMU), but that way seccomp jail would be much
>>>> tighter.
>>>
>>> When I spoke to Anthony about this earlier (offline, sorry) he was opposed to
>>> requiring any switches or user interaction to enable seccomp.  I'm not sure if
>>> his stance on this has changed any over the past few months.
>>
>> There could be two modes, strict mode (-seccomp) and default mode
>> (only some syscalls blocked). With the future decomposed QEMU, strict
>> seccomp mode would be default and the switch would be obsoleted. If
>> the decomposition is planned to happen soonish, adding the switch
>> would be just churn.
>
> We have decomposed qemu to some extent, in that privileged operations
> happen in libvirt.  So the modes make sense - qemu has no idea whether a
> privileged management system is controlling it or not.

So with -seccomp, libvirt could tell QEMU that for example open(),
execve(), bind() and connect() will never be needed?

>
>>
>>>
>>> In my perfect world, we would have a decomposed QEMU that functions as a
>>> series of processes connected via some sort of IPC; the exact divisions are a
>>> bit TBD and beyond the scope of this discussion.  In this scenario we would be
>>> able to restrict QEMU with sVirt and seccomp to a much higher degree than we
>>> could with the current monolithic QEMU.
>>>
>>> I don't expect to see my perfect world any time soon, but in the meantime we
>>> can still improve the security of QEMU on Linux with these seccomp patches and
>>> for that reason I think it's a win.  Since these patches don't expose anything
>>> at runtime (no knobs, switches, etc.) we leave ourselves plenty of flexibility
>>> for changing things in the future.
>>
>> Yes, I'm much in favor of adding seccomp support soon. But I just
>> wonder if this is really the best level of security we can reach now,
>> not assuming decomposed QEMU, but just minor tweaks?
>
> We might disable mprotect(PROT_EXEC) if running with kvm.
>
> --
> error compiling committee.c: too many arguments to function
>
>
Avi Kivity June 21, 2012, 8:04 a.m. UTC | #32
On 06/19/2012 09:58 PM, Blue Swirl wrote:
>>> At least qemu-ifup/down scripts, migration exec and smbd have been
>>> mentioned. Only the system calls made by smbd (for some version of it)
>>> can be known. The user could specify arbitrary commands for the
>>> others, those could be assumed to use some common (large) subset of
>>> system calls but I think the security value would be close to zero
>>> then.
>>
>> We're not trying to protect against the user, but against the guest.  If
>> we assume the user wrote those scripts with care so they cannot be
>> exploited by the guest, then we are okay.
> 
> My concern was that first we could accidentally filter a system call
> that changes the script or executable behavior, much like sendmail +
> capabilities bug, and then a guest could trigger running this
> script/executable and exploit the changed behavior.

Ah, I see.  I agree this is dangerous.  We should probably disable exec
if we seccomp.

>>
>> We have decomposed qemu to some extent, in that privileged operations
>> happen in libvirt.  So the modes make sense - qemu has no idea whether a
>> privileged management system is controlling it or not.
> 
> So with -seccomp, libvirt could tell QEMU that for example open(),
> execve(), bind() and connect() will never be needed?

Yes.
Anthony Liguori June 27, 2012, 9:25 p.m. UTC | #33
On 06/21/2012 03:04 AM, Avi Kivity wrote:
> On 06/19/2012 09:58 PM, Blue Swirl wrote:
>>>> At least qemu-ifup/down scripts, migration exec and smbd have been
>>>> mentioned. Only the system calls made by smbd (for some version of it)
>>>> can be known. The user could specify arbitrary commands for the
>>>> others, those could be assumed to use some common (large) subset of
>>>> system calls but I think the security value would be close to zero
>>>> then.
>>>
>>> We're not trying to protect against the user, but against the guest.  If
>>> we assume the user wrote those scripts with care so they cannot be
>>> exploited by the guest, then we are okay.
>>
>> My concern was that first we could accidentally filter a system call
>> that changes the script or executable behavior, much like sendmail +
>> capabilities bug, and then a guest could trigger running this
>> script/executable and exploit the changed behavior.
>
> Ah, I see.  I agree this is dangerous.  We should probably disable exec
> if we seccomp.

There's no great place to jump into this thread so I guess I'll do it here.

There is absolutely no doubt that white-listing syscalls that we currently use 
provides an improvement in security.

We need to assume:

1) QEMU is run as an unprivileged user

2) QEMU is already heavily restricted by SELinux

In this case, seccomp() is not being used to replace MAC or DAC.  It's 
supplementing both of them by additionally filtering out syscalls that may have 
unknown kernel exploits in them.  That's all this initial effort is about. 
Since it's scope is so limited, we can simply enable it unconditionally too.

After we have this initial support, then we can look at a -sandbox option.  This 
open could prevent things like open()/execve() but that will come at a cost of 
features.

I think the reasonable thing to do for -sandbox is to basically focus on the set 
of syscalls that QEMU would use if it were launched under libvirt.  We should 
obviously make improvements (things like -blockdev) to make this even more 
restrictive.

Who knows, maybe we end up having multiple types of sandboxes.  A '-sandbox 
libvirt' and a '-sandbox user' where the later is focused on the typical usage 
of an unprivileged user.

But this is all stuff that can come later.  We solve a big problem by just 
getting the initial whitelist support in.

Regards,

Anthony Liguori

>
>>>
>>> We have decomposed qemu to some extent, in that privileged operations
>>> happen in libvirt.  So the modes make sense - qemu has no idea whether a
>>> privileged management system is controlling it or not.
>>
>> So with -seccomp, libvirt could tell QEMU that for example open(),
>> execve(), bind() and connect() will never be needed?
>
> Yes.
>
Blue Swirl June 28, 2012, 7:49 p.m. UTC | #34
On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
> On 06/21/2012 03:04 AM, Avi Kivity wrote:
>>
>> On 06/19/2012 09:58 PM, Blue Swirl wrote:
>>>>>
>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been
>>>>> mentioned. Only the system calls made by smbd (for some version of it)
>>>>> can be known. The user could specify arbitrary commands for the
>>>>> others, those could be assumed to use some common (large) subset of
>>>>> system calls but I think the security value would be close to zero
>>>>> then.
>>>>
>>>>
>>>> We're not trying to protect against the user, but against the guest.  If
>>>> we assume the user wrote those scripts with care so they cannot be
>>>> exploited by the guest, then we are okay.
>>>
>>>
>>> My concern was that first we could accidentally filter a system call
>>> that changes the script or executable behavior, much like sendmail +
>>> capabilities bug, and then a guest could trigger running this
>>> script/executable and exploit the changed behavior.
>>
>>
>> Ah, I see.  I agree this is dangerous.  We should probably disable exec
>> if we seccomp.
>
>
> There's no great place to jump into this thread so I guess I'll do it here.
>
> There is absolutely no doubt that white-listing syscalls that we currently
> use provides an improvement in security.
>
> We need to assume:
>
> 1) QEMU is run as an unprivileged user
>
> 2) QEMU is already heavily restricted by SELinux
>
> In this case, seccomp() is not being used to replace MAC or DAC.  It's
> supplementing both of them by additionally filtering out syscalls that may
> have unknown kernel exploits in them.  That's all this initial effort is
> about. Since it's scope is so limited, we can simply enable it
> unconditionally too.

I don't think the scope is limited in a safe way. What is the set of
system calls that can't ever cause problems to any possible ifup/down
scripts, migration exec helpers and various versions of smbd?

For example, unlink() is missing. What if the ifup/down script needs
it for lock file cleanup? ftruncate()? Every socket syscalls in case
LDAP is used to access user information by the libc?

I think we can't define the safe set, except 'allow all'. I'd propose
one of the following to avoid breakage:

1. Allow all system calls for the initial patch, refactor later to
reduce the set. Useless until refactored.

2. Don't make seccomp mode enabled default, when enabled, forbid
execve(). Limits functionality when enabled, no security benefit if
not enabled.

3. Before enabling seccomp, fork a helper process without restrictions
that is used to launch other programs. Needs some work.

>
> After we have this initial support, then we can look at a -sandbox option.
>  This open could prevent things like open()/execve() but that will come at a
> cost of features.
>
> I think the reasonable thing to do for -sandbox is to basically focus on the
> set of syscalls that QEMU would use if it were launched under libvirt.  We
> should obviously make improvements (things like -blockdev) to make this even
> more restrictive.
>
> Who knows, maybe we end up having multiple types of sandboxes.  A '-sandbox
> libvirt' and a '-sandbox user' where the later is focused on the typical
> usage of an unprivileged user.
>
> But this is all stuff that can come later.  We solve a big problem by just
> getting the initial whitelist support in.

Fully agree, but we'd have to agree about what is a safe initial whitelist.

>
> Regards,
>
> Anthony Liguori
>
>
>>
>>>>
>>>> We have decomposed qemu to some extent, in that privileged operations
>>>> happen in libvirt.  So the modes make sense - qemu has no idea whether a
>>>> privileged management system is controlling it or not.
>>>
>>>
>>> So with -seccomp, libvirt could tell QEMU that for example open(),
>>> execve(), bind() and connect() will never be needed?
>>
>>
>> Yes.
>>
>
Corey Bryant June 29, 2012, 3:27 p.m. UTC | #35
On 06/28/2012 03:49 PM, Blue Swirl wrote:
> On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> On 06/21/2012 03:04 AM, Avi Kivity wrote:
>>>
>>> On 06/19/2012 09:58 PM, Blue Swirl wrote:
>>>>>>
>>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been
>>>>>> mentioned. Only the system calls made by smbd (for some version of it)
>>>>>> can be known. The user could specify arbitrary commands for the
>>>>>> others, those could be assumed to use some common (large) subset of
>>>>>> system calls but I think the security value would be close to zero
>>>>>> then.
>>>>>
>>>>>
>>>>> We're not trying to protect against the user, but against the guest.  If
>>>>> we assume the user wrote those scripts with care so they cannot be
>>>>> exploited by the guest, then we are okay.
>>>>
>>>>
>>>> My concern was that first we could accidentally filter a system call
>>>> that changes the script or executable behavior, much like sendmail +
>>>> capabilities bug, and then a guest could trigger running this
>>>> script/executable and exploit the changed behavior.
>>>
>>>
>>> Ah, I see.  I agree this is dangerous.  We should probably disable exec
>>> if we seccomp.
>>
>>
>> There's no great place to jump into this thread so I guess I'll do it here.
>>
>> There is absolutely no doubt that white-listing syscalls that we currently
>> use provides an improvement in security.
>>
>> We need to assume:
>>
>> 1) QEMU is run as an unprivileged user
>>
>> 2) QEMU is already heavily restricted by SELinux
>>
>> In this case, seccomp() is not being used to replace MAC or DAC.  It's
>> supplementing both of them by additionally filtering out syscalls that may
>> have unknown kernel exploits in them.  That's all this initial effort is
>> about. Since it's scope is so limited, we can simply enable it
>> unconditionally too.
>
> I don't think the scope is limited in a safe way. What is the set of
> system calls that can't ever cause problems to any possible ifup/down
> scripts, migration exec helpers and various versions of smbd?
>
> For example, unlink() is missing. What if the ifup/down script needs
> it for lock file cleanup? ftruncate()? Every socket syscalls in case
> LDAP is used to access user information by the libc?
>
> I think we can't define the safe set, except 'allow all'. I'd propose
> one of the following to avoid breakage:
>
> 1. Allow all system calls for the initial patch, refactor later to
> reduce the set. Useless until refactored.
>
> 2. Don't make seccomp mode enabled default, when enabled, forbid
> execve(). Limits functionality when enabled, no security benefit if
> not enabled.

It should be noted that PR_SET_NO_NEW_PRIVS is set by default when the 
seccomp filter is enabled by libseccomp.  This prevents any new 
privileges from being granted on execve.

>
> 3. Before enabling seccomp, fork a helper process without restrictions
> that is used to launch other programs. Needs some work.
>
>>
>> After we have this initial support, then we can look at a -sandbox option.
>>   This open could prevent things like open()/execve() but that will come at a
>> cost of features.
>>
>> I think the reasonable thing to do for -sandbox is to basically focus on the
>> set of syscalls that QEMU would use if it were launched under libvirt.  We
>> should obviously make improvements (things like -blockdev) to make this even
>> more restrictive.
>>
>> Who knows, maybe we end up having multiple types of sandboxes.  A '-sandbox
>> libvirt' and a '-sandbox user' where the later is focused on the typical
>> usage of an unprivileged user.
>>
>> But this is all stuff that can come later.  We solve a big problem by just
>> getting the initial whitelist support in.
>
> Fully agree, but we'd have to agree about what is a safe initial whitelist.
>
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>
>>>
>>>>>
>>>>> We have decomposed qemu to some extent, in that privileged operations
>>>>> happen in libvirt.  So the modes make sense - qemu has no idea whether a
>>>>> privileged management system is controlling it or not.
>>>>
>>>>
>>>> So with -seccomp, libvirt could tell QEMU that for example open(),
>>>> execve(), bind() and connect() will never be needed?
>>>
>>>
>>> Yes.
>>>
>>
>
Blue Swirl June 29, 2012, 8 p.m. UTC | #36
On Fri, Jun 29, 2012 at 3:27 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote:
>
>
> On 06/28/2012 03:49 PM, Blue Swirl wrote:
>>
>> On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws>
>> wrote:
>>>
>>> On 06/21/2012 03:04 AM, Avi Kivity wrote:
>>>>
>>>>
>>>> On 06/19/2012 09:58 PM, Blue Swirl wrote:
>>>>>>>
>>>>>>>
>>>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been
>>>>>>> mentioned. Only the system calls made by smbd (for some version of
>>>>>>> it)
>>>>>>> can be known. The user could specify arbitrary commands for the
>>>>>>> others, those could be assumed to use some common (large) subset of
>>>>>>> system calls but I think the security value would be close to zero
>>>>>>> then.
>>>>>>
>>>>>>
>>>>>>
>>>>>> We're not trying to protect against the user, but against the guest.
>>>>>>  If
>>>>>> we assume the user wrote those scripts with care so they cannot be
>>>>>> exploited by the guest, then we are okay.
>>>>>
>>>>>
>>>>>
>>>>> My concern was that first we could accidentally filter a system call
>>>>> that changes the script or executable behavior, much like sendmail +
>>>>> capabilities bug, and then a guest could trigger running this
>>>>> script/executable and exploit the changed behavior.
>>>>
>>>>
>>>>
>>>> Ah, I see.  I agree this is dangerous.  We should probably disable exec
>>>> if we seccomp.
>>>
>>>
>>>
>>> There's no great place to jump into this thread so I guess I'll do it
>>> here.
>>>
>>> There is absolutely no doubt that white-listing syscalls that we
>>> currently
>>> use provides an improvement in security.
>>>
>>> We need to assume:
>>>
>>> 1) QEMU is run as an unprivileged user
>>>
>>> 2) QEMU is already heavily restricted by SELinux
>>>
>>> In this case, seccomp() is not being used to replace MAC or DAC.  It's
>>> supplementing both of them by additionally filtering out syscalls that
>>> may
>>> have unknown kernel exploits in them.  That's all this initial effort is
>>> about. Since it's scope is so limited, we can simply enable it
>>> unconditionally too.
>>
>>
>> I don't think the scope is limited in a safe way. What is the set of
>> system calls that can't ever cause problems to any possible ifup/down
>> scripts, migration exec helpers and various versions of smbd?
>>
>> For example, unlink() is missing. What if the ifup/down script needs
>> it for lock file cleanup? ftruncate()? Every socket syscalls in case
>> LDAP is used to access user information by the libc?
>>
>> I think we can't define the safe set, except 'allow all'. I'd propose
>> one of the following to avoid breakage:
>>
>> 1. Allow all system calls for the initial patch, refactor later to
>> reduce the set. Useless until refactored.
>>
>> 2. Don't make seccomp mode enabled default, when enabled, forbid
>> execve(). Limits functionality when enabled, no security benefit if
>> not enabled.
>
>
> It should be noted that PR_SET_NO_NEW_PRIVS is set by default when the
> seccomp filter is enabled by libseccomp.  This prevents any new privileges
> from being granted on execve.

This is probably getting very hypothetical, but what happens if the
ifup/down scripts need to run a setuid/gid helper or a helper with
additional privileges from file system capabilities?

>
>
>>
>> 3. Before enabling seccomp, fork a helper process without restrictions
>> that is used to launch other programs. Needs some work.
>>
>>>
>>> After we have this initial support, then we can look at a -sandbox
>>> option.
>>>  This open could prevent things like open()/execve() but that will come
>>> at a
>>> cost of features.
>>>
>>> I think the reasonable thing to do for -sandbox is to basically focus on
>>> the
>>> set of syscalls that QEMU would use if it were launched under libvirt.
>>>  We
>>> should obviously make improvements (things like -blockdev) to make this
>>> even
>>> more restrictive.
>>>
>>> Who knows, maybe we end up having multiple types of sandboxes.  A
>>> '-sandbox
>>> libvirt' and a '-sandbox user' where the later is focused on the typical
>>> usage of an unprivileged user.
>>>
>>> But this is all stuff that can come later.  We solve a big problem by
>>> just
>>> getting the initial whitelist support in.
>>
>>
>> Fully agree, but we'd have to agree about what is a safe initial
>> whitelist.
>>
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>
>>>>
>>>>>>
>>>>>> We have decomposed qemu to some extent, in that privileged operations
>>>>>> happen in libvirt.  So the modes make sense - qemu has no idea whether
>>>>>> a
>>>>>> privileged management system is controlling it or not.
>>>>>
>>>>>
>>>>>
>>>>> So with -seccomp, libvirt could tell QEMU that for example open(),
>>>>> execve(), bind() and connect() will never be needed?
>>>>
>>>>
>>>>
>>>> Yes.
>>>>
>>>
>>
>
> --
> Regards,
> Corey
>
>
Corey Bryant June 29, 2012, 8:36 p.m. UTC | #37
On 06/29/2012 04:00 PM, Blue Swirl wrote:
> On Fri, Jun 29, 2012 at 3:27 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote:
>>
>>
>> On 06/28/2012 03:49 PM, Blue Swirl wrote:
>>>
>>> On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws>
>>> wrote:
>>>>
>>>> On 06/21/2012 03:04 AM, Avi Kivity wrote:
>>>>>
>>>>>
>>>>> On 06/19/2012 09:58 PM, Blue Swirl wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been
>>>>>>>> mentioned. Only the system calls made by smbd (for some version of
>>>>>>>> it)
>>>>>>>> can be known. The user could specify arbitrary commands for the
>>>>>>>> others, those could be assumed to use some common (large) subset of
>>>>>>>> system calls but I think the security value would be close to zero
>>>>>>>> then.
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> We're not trying to protect against the user, but against the guest.
>>>>>>>   If
>>>>>>> we assume the user wrote those scripts with care so they cannot be
>>>>>>> exploited by the guest, then we are okay.
>>>>>>
>>>>>>
>>>>>>
>>>>>> My concern was that first we could accidentally filter a system call
>>>>>> that changes the script or executable behavior, much like sendmail +
>>>>>> capabilities bug, and then a guest could trigger running this
>>>>>> script/executable and exploit the changed behavior.
>>>>>
>>>>>
>>>>>
>>>>> Ah, I see.  I agree this is dangerous.  We should probably disable exec
>>>>> if we seccomp.
>>>>
>>>>
>>>>
>>>> There's no great place to jump into this thread so I guess I'll do it
>>>> here.
>>>>
>>>> There is absolutely no doubt that white-listing syscalls that we
>>>> currently
>>>> use provides an improvement in security.
>>>>
>>>> We need to assume:
>>>>
>>>> 1) QEMU is run as an unprivileged user
>>>>
>>>> 2) QEMU is already heavily restricted by SELinux
>>>>
>>>> In this case, seccomp() is not being used to replace MAC or DAC.  It's
>>>> supplementing both of them by additionally filtering out syscalls that
>>>> may
>>>> have unknown kernel exploits in them.  That's all this initial effort is
>>>> about. Since it's scope is so limited, we can simply enable it
>>>> unconditionally too.
>>>
>>>
>>> I don't think the scope is limited in a safe way. What is the set of
>>> system calls that can't ever cause problems to any possible ifup/down
>>> scripts, migration exec helpers and various versions of smbd?
>>>
>>> For example, unlink() is missing. What if the ifup/down script needs
>>> it for lock file cleanup? ftruncate()? Every socket syscalls in case
>>> LDAP is used to access user information by the libc?
>>>
>>> I think we can't define the safe set, except 'allow all'. I'd propose
>>> one of the following to avoid breakage:
>>>
>>> 1. Allow all system calls for the initial patch, refactor later to
>>> reduce the set. Useless until refactored.
>>>
>>> 2. Don't make seccomp mode enabled default, when enabled, forbid
>>> execve(). Limits functionality when enabled, no security benefit if
>>> not enabled.
>>
>>
>> It should be noted that PR_SET_NO_NEW_PRIVS is set by default when the
>> seccomp filter is enabled by libseccomp.  This prevents any new privileges
>> from being granted on execve.
>
> This is probably getting very hypothetical, but what happens if the
> ifup/down scripts need to run a setuid/gid helper or a helper with
> additional privileges from file system capabilities?
>

I believe execve() will either fail or not grant the privileges.
Paolo Bonzini July 1, 2012, 1:25 p.m. UTC | #38
Il 18/06/2012 23:53, Corey Bryant ha scritto:
>>
>> Can each thread have separate seccomp whitelists? For example CPU
>> threads should not need pretty much anything but the I/O thread needs
>> I/O.
>>
> 
> No, seccomp filters are defined and enforced at the process level.

Perhaps we can add (at the kernel level) a way for seccomp filters to
examine the current tid.

Paolo
Will Drewry July 2, 2012, 2:18 a.m. UTC | #39
On Sun, Jul 1, 2012 at 8:25 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 18/06/2012 23:53, Corey Bryant ha scritto:
>>>
>>> Can each thread have separate seccomp whitelists? For example CPU
>>> threads should not need pretty much anything but the I/O thread needs
>>> I/O.
>>>
>>
>> No, seccomp filters are defined and enforced at the process level.
>
> Perhaps we can add (at the kernel level) a way for seccomp filters to
> examine the current tid.

seccomp filters are attached to the task_struct and apply per "thread"
or per process since they both get their own task_structs.  (For
Linux, process==thread with shared resources.)  Filter programs are
also inherited across clone/fork, so it's possible to install a
"global" filter program which applies which is inherited during thread
creation, then apply per-thread refinements by stacking on additional
filters (at the cost of additional evaluation time).

hth!
will
Corey Bryant July 2, 2012, 2:20 p.m. UTC | #40
On 07/01/2012 10:18 PM, Will Drewry wrote:
> On Sun, Jul 1, 2012 at 8:25 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> Il 18/06/2012 23:53, Corey Bryant ha scritto:
>>>>
>>>> Can each thread have separate seccomp whitelists? For example CPU
>>>> threads should not need pretty much anything but the I/O thread needs
>>>> I/O.
>>>>
>>>
>>> No, seccomp filters are defined and enforced at the process level.
>>
>> Perhaps we can add (at the kernel level) a way for seccomp filters to
>> examine the current tid.

Sorry for the confusion.  I corrected my statement in a later thread 
based on Will's input: 
http://lists.nongnu.org/archive/html/qemu-devel/2012-06/msg03212.html

>
> seccomp filters are attached to the task_struct and apply per "thread"
> or per process since they both get their own task_structs.  (For
> Linux, process==thread with shared resources.)  Filter programs are
> also inherited across clone/fork, so it's possible to install a
> "global" filter program which applies which is inherited during thread
> creation, then apply per-thread refinements by stacking on additional
> filters (at the cost of additional evaluation time).
>
> hth!
> will
>

Thanks!
Corey Bryant July 2, 2012, 6:05 p.m. UTC | #41
On 06/28/2012 03:49 PM, Blue Swirl wrote:
> On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws> wrote:
>> On 06/21/2012 03:04 AM, Avi Kivity wrote:
>>>
>>> On 06/19/2012 09:58 PM, Blue Swirl wrote:
>>>>>>
>>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been
>>>>>> mentioned. Only the system calls made by smbd (for some version of it)
>>>>>> can be known. The user could specify arbitrary commands for the
>>>>>> others, those could be assumed to use some common (large) subset of
>>>>>> system calls but I think the security value would be close to zero
>>>>>> then.
>>>>>
>>>>>
>>>>> We're not trying to protect against the user, but against the guest.  If
>>>>> we assume the user wrote those scripts with care so they cannot be
>>>>> exploited by the guest, then we are okay.
>>>>
>>>>
>>>> My concern was that first we could accidentally filter a system call
>>>> that changes the script or executable behavior, much like sendmail +
>>>> capabilities bug, and then a guest could trigger running this
>>>> script/executable and exploit the changed behavior.
>>>
>>>
>>> Ah, I see.  I agree this is dangerous.  We should probably disable exec
>>> if we seccomp.
>>
>>
>> There's no great place to jump into this thread so I guess I'll do it here.
>>
>> There is absolutely no doubt that white-listing syscalls that we currently
>> use provides an improvement in security.
>>
>> We need to assume:
>>
>> 1) QEMU is run as an unprivileged user
>>
>> 2) QEMU is already heavily restricted by SELinux
>>
>> In this case, seccomp() is not being used to replace MAC or DAC.  It's
>> supplementing both of them by additionally filtering out syscalls that may
>> have unknown kernel exploits in them.  That's all this initial effort is
>> about. Since it's scope is so limited, we can simply enable it
>> unconditionally too.
>
> I don't think the scope is limited in a safe way. What is the set of
> system calls that can't ever cause problems to any possible ifup/down
> scripts, migration exec helpers and various versions of smbd?
>
> For example, unlink() is missing. What if the ifup/down script needs
> it for lock file cleanup? ftruncate()? Every socket syscalls in case
> LDAP is used to access user information by the libc?
>
> I think we can't define the safe set, except 'allow all'. I'd propose
> one of the following to avoid breakage:
>
> 1. Allow all system calls for the initial patch, refactor later to
> reduce the set. Useless until refactored.

One thing I like about starting with a known subset of syscalls used by 
QEMU is that it forces us to expand the whitelist if we come across more 
syscalls that QEMU uses.

An issue with this approach is that if seccomp kills QEMU for using a 
disallowed syscall, I don't think we know what syscall it is.  (At 
least, I don't think it is accessible anywhere.)  This is good for 
security but makes it hard for developers who are debugging.

Would it make sense to have the ability to configure QEMU in either:
1) seccomp kill mode (this is what the existing patches do), or
2) seccomp debug mode?

In debug mode we could trap on the failing syscall (using 
SCMP_ACT_TRAP), determine the syscall value, and issue an error message 
that displays the syscall value.

The emulator() function here gives an idea of how this could be done: 
https://lkml.org/lkml/2012/4/12/449

>
> 2. Don't make seccomp mode enabled default, when enabled, forbid
> execve(). Limits functionality when enabled, no security benefit if
> not enabled.
>
> 3. Before enabling seccomp, fork a helper process without restrictions
> that is used to launch other programs. Needs some work.
>
>>
>> After we have this initial support, then we can look at a -sandbox option.
>>   This open could prevent things like open()/execve() but that will come at a
>> cost of features.
>>
>> I think the reasonable thing to do for -sandbox is to basically focus on the
>> set of syscalls that QEMU would use if it were launched under libvirt.  We
>> should obviously make improvements (things like -blockdev) to make this even
>> more restrictive.
>>
>> Who knows, maybe we end up having multiple types of sandboxes.  A '-sandbox
>> libvirt' and a '-sandbox user' where the later is focused on the typical
>> usage of an unprivileged user.
>>
>> But this is all stuff that can come later.  We solve a big problem by just
>> getting the initial whitelist support in.
>
> Fully agree, but we'd have to agree about what is a safe initial whitelist.
>
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>
>>>
>>>>>
>>>>> We have decomposed qemu to some extent, in that privileged operations
>>>>> happen in libvirt.  So the modes make sense - qemu has no idea whether a
>>>>> privileged management system is controlling it or not.
>>>>
>>>>
>>>> So with -seccomp, libvirt could tell QEMU that for example open(),
>>>> execve(), bind() and connect() will never be needed?
>>>
>>>
>>> Yes.
>>>
>>
>
Blue Swirl July 3, 2012, 7:15 p.m. UTC | #42
On Mon, Jul 2, 2012 at 6:05 PM, Corey Bryant <coreyb@linux.vnet.ibm.com> wrote:
>
>
> On 06/28/2012 03:49 PM, Blue Swirl wrote:
>>
>> On Wed, Jun 27, 2012 at 9:25 PM, Anthony Liguori <anthony@codemonkey.ws>
>> wrote:
>>>
>>> On 06/21/2012 03:04 AM, Avi Kivity wrote:
>>>>
>>>>
>>>> On 06/19/2012 09:58 PM, Blue Swirl wrote:
>>>>>>>
>>>>>>>
>>>>>>> At least qemu-ifup/down scripts, migration exec and smbd have been
>>>>>>> mentioned. Only the system calls made by smbd (for some version of
>>>>>>> it)
>>>>>>> can be known. The user could specify arbitrary commands for the
>>>>>>> others, those could be assumed to use some common (large) subset of
>>>>>>> system calls but I think the security value would be close to zero
>>>>>>> then.
>>>>>>
>>>>>>
>>>>>>
>>>>>> We're not trying to protect against the user, but against the guest.
>>>>>> If
>>>>>> we assume the user wrote those scripts with care so they cannot be
>>>>>> exploited by the guest, then we are okay.
>>>>>
>>>>>
>>>>>
>>>>> My concern was that first we could accidentally filter a system call
>>>>> that changes the script or executable behavior, much like sendmail +
>>>>> capabilities bug, and then a guest could trigger running this
>>>>> script/executable and exploit the changed behavior.
>>>>
>>>>
>>>>
>>>> Ah, I see.  I agree this is dangerous.  We should probably disable exec
>>>> if we seccomp.
>>>
>>>
>>>
>>> There's no great place to jump into this thread so I guess I'll do it
>>> here.
>>>
>>> There is absolutely no doubt that white-listing syscalls that we
>>> currently
>>> use provides an improvement in security.
>>>
>>> We need to assume:
>>>
>>> 1) QEMU is run as an unprivileged user
>>>
>>> 2) QEMU is already heavily restricted by SELinux
>>>
>>> In this case, seccomp() is not being used to replace MAC or DAC.  It's
>>> supplementing both of them by additionally filtering out syscalls that
>>> may
>>> have unknown kernel exploits in them.  That's all this initial effort is
>>> about. Since it's scope is so limited, we can simply enable it
>>> unconditionally too.
>>
>>
>> I don't think the scope is limited in a safe way. What is the set of
>> system calls that can't ever cause problems to any possible ifup/down
>> scripts, migration exec helpers and various versions of smbd?
>>
>> For example, unlink() is missing. What if the ifup/down script needs
>> it for lock file cleanup? ftruncate()? Every socket syscalls in case
>> LDAP is used to access user information by the libc?
>>
>> I think we can't define the safe set, except 'allow all'. I'd propose
>> one of the following to avoid breakage:
>>
>> 1. Allow all system calls for the initial patch, refactor later to
>> reduce the set. Useless until refactored.
>
>
> One thing I like about starting with a known subset of syscalls used by QEMU
> is that it forces us to expand the whitelist if we come across more syscalls
> that QEMU uses.

Finding out what QEMU uses is the relatively easy part. Finding out
what the external helpers might use seems to be impossible.

>
> An issue with this approach is that if seccomp kills QEMU for using a
> disallowed syscall, I don't think we know what syscall it is.  (At least, I
> don't think it is accessible anywhere.)  This is good for security but makes
> it hard for developers who are debugging.
>
> Would it make sense to have the ability to configure QEMU in either:
> 1) seccomp kill mode (this is what the existing patches do), or
> 2) seccomp debug mode?
>
> In debug mode we could trap on the failing syscall (using SCMP_ACT_TRAP),
> determine the syscall value, and issue an error message that displays the
> syscall value.

I think that it would be nice and it would be useful also after any refactoring.

>
> The emulator() function here gives an idea of how this could be done:
> https://lkml.org/lkml/2012/4/12/449
>
>
>>
>> 2. Don't make seccomp mode enabled default, when enabled, forbid
>> execve(). Limits functionality when enabled, no security benefit if
>> not enabled.
>>
>> 3. Before enabling seccomp, fork a helper process without restrictions
>> that is used to launch other programs. Needs some work.
>>
>>>
>>> After we have this initial support, then we can look at a -sandbox
>>> option.
>>>   This open could prevent things like open()/execve() but that will come
>>> at a
>>> cost of features.
>>>
>>> I think the reasonable thing to do for -sandbox is to basically focus on
>>> the
>>> set of syscalls that QEMU would use if it were launched under libvirt.
>>> We
>>> should obviously make improvements (things like -blockdev) to make this
>>> even
>>> more restrictive.
>>>
>>> Who knows, maybe we end up having multiple types of sandboxes.  A
>>> '-sandbox
>>> libvirt' and a '-sandbox user' where the later is focused on the typical
>>> usage of an unprivileged user.
>>>
>>> But this is all stuff that can come later.  We solve a big problem by
>>> just
>>> getting the initial whitelist support in.
>>
>>
>> Fully agree, but we'd have to agree about what is a safe initial
>> whitelist.
>>
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>
>>>>
>>>>>>
>>>>>> We have decomposed qemu to some extent, in that privileged operations
>>>>>> happen in libvirt.  So the modes make sense - qemu has no idea whether
>>>>>> a
>>>>>> privileged management system is controlling it or not.
>>>>>
>>>>>
>>>>>
>>>>> So with -seccomp, libvirt could tell QEMU that for example open(),
>>>>> execve(), bind() and connect() will never be needed?
>>>>
>>>>
>>>>
>>>> Yes.
>>>>
>>>
>>
>
> --
> Regards,
> Corey
>
>
diff mbox

Patch

diff --git a/qemu-seccomp.c b/qemu-seccomp.c
new file mode 100644
index 0000000..048b7ba
--- /dev/null
+++ b/qemu-seccomp.c
@@ -0,0 +1,73 @@ 
+#include <stdio.h>
+#include <seccomp.h>
+#include "qemu-seccomp.h"
+
+static struct QemuSeccompSyscall seccomp_whitelist[] = {
+    { SCMP_SYS(timer_settime), 255 },
+    { SCMP_SYS(timer_gettime), 254 },
+    { SCMP_SYS(futex), 253 },
+    { SCMP_SYS(select), 252 },
+    { SCMP_SYS(recvfrom), 251 },
+    { SCMP_SYS(sendto), 250 },
+    { SCMP_SYS(read), 249 },
+    { SCMP_SYS(brk), 248 },
+    { SCMP_SYS(clone), 247 },
+    { SCMP_SYS(mmap), 247 },
+    { SCMP_SYS(mprotect), 246 },
+    { SCMP_SYS(ioctl), 245 },
+    { SCMP_SYS(recvmsg), 245 },
+    { SCMP_SYS(sendmsg), 245 },
+    { SCMP_SYS(accept), 245 },
+    { SCMP_SYS(connect), 245 },
+    { SCMP_SYS(bind), 245 },
+    { SCMP_SYS(listen), 245 },
+    { SCMP_SYS(ioctl), 245 },
+    { SCMP_SYS(eventfd), 245 },
+    { SCMP_SYS(rt_sigprocmask), 245 },
+    { SCMP_SYS(write), 244 },
+    { SCMP_SYS(fcntl), 243 },
+    { SCMP_SYS(tgkill), 242 },
+    { SCMP_SYS(rt_sigaction), 242 },
+    { SCMP_SYS(pipe2), 242 },
+    { SCMP_SYS(munmap), 242 },
+    { SCMP_SYS(mremap), 242 },
+    { SCMP_SYS(getsockname), 242 },
+    { SCMP_SYS(getpeername), 242 },
+    { SCMP_SYS(fdatasync), 242 },
+    { SCMP_SYS(close), 242 }
+};
+
+#define seccomp_whitelist_count ARRAY_SIZE(seccomp_whitelist)
+
+int seccomp_start(void)
+{
+    int rc = 0;
+    unsigned int i = 0;
+
+    rc = seccomp_init(SCMP_ACT_KILL);
+    if (rc < 0) {
+        goto seccomp_return;
+    }
+
+    for (i = 0; i < seccomp_whitelist_count; i++) {
+        rc = seccomp_rule_add(SCMP_ACT_ALLOW, seccomp_whitelist[i].num, 0);
+        if (rc < 0) {
+            goto seccomp_return;
+        }
+        rc = seccomp_syscall_priority(seccomp_whitelist[i].num,
+                                      seccomp_whitelist[i].priority);
+        if (rc < 0) {
+            goto seccomp_return;
+        }
+    }
+
+    rc = seccomp_load();
+
+  seccomp_return:
+    seccomp_release();
+    if (rc < 0) {
+        fprintf(stderr,
+                "ERROR: failed to configure the seccomp syscall filter in the kernel\n");
+    }
+    return rc;
+}
diff --git a/qemu-seccomp.h b/qemu-seccomp.h
new file mode 100644
index 0000000..3bbdd87
--- /dev/null
+++ b/qemu-seccomp.h
@@ -0,0 +1,9 @@ 
+#include <seccomp.h>
+#include "osdep.h"
+
+struct QemuSeccompSyscall {
+    int32_t num;
+    uint8_t priority;
+};
+
+int seccomp_start(void);
diff --git a/vl.c b/vl.c
index 204d85b..315afaf 100644
--- a/vl.c
+++ b/vl.c
@@ -61,6 +61,9 @@ 
 
 #include <linux/ppdev.h>
 #include <linux/parport.h>
+#ifdef CONFIG_LIBSECCOMP
+#include "qemu-seccomp.h"
+#endif
 #endif
 #ifdef __sun__
 #include <sys/stat.h>
@@ -2296,6 +2299,10 @@  int main(int argc, char **argv, char **envp)
     const char *trace_events = NULL;
     const char *trace_file = NULL;
 
+#ifdef CONFIG_LIBSECCOMP
+    seccomp_start();
+#endif
+
     atexit(qemu_run_exit_notifiers);
     error_set_progname(argv[0]);