Patchwork [PATCHv2,3/4] Support for "double whitelist" filters

login
register
mail settings
Submitter Eduardo Otubo
Date Oct. 23, 2012, 5:55 a.m.
Message ID <1350971732-16621-3-git-send-email-otubo@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/193358/
State New
Headers show

Comments

Eduardo Otubo - Oct. 23, 2012, 5:55 a.m.
This patch includes a second whitelist right before the main loop. It's
a smaller and more restricted whitelist, excluding execve() among many
others.

v2: * ctx changed to main_loop_ctx
    * seccomp_on now inside ifdef
    * open syscall added to the main_loop whitelist

Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
---
 qemu-seccomp.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
 qemu-seccomp.h |  7 ++++-
 vl.c           | 21 +++++++++++--
 3 files changed, 114 insertions(+), 13 deletions(-)
Corey Bryant - Oct. 23, 2012, 3:10 p.m.
On 10/23/2012 01:55 AM, Eduardo Otubo wrote:
> This patch includes a second whitelist right before the main loop. It's
> a smaller and more restricted whitelist, excluding execve() among many
> others.
>
> v2: * ctx changed to main_loop_ctx
>      * seccomp_on now inside ifdef
>      * open syscall added to the main_loop whitelist
>
> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> ---
>   qemu-seccomp.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
>   qemu-seccomp.h |  7 ++++-
>   vl.c           | 21 +++++++++++--
>   3 files changed, 114 insertions(+), 13 deletions(-)
>
> diff --git a/qemu-seccomp.c b/qemu-seccomp.c
> index a7b33e2..033cfad 100644
> --- a/qemu-seccomp.c
> +++ b/qemu-seccomp.c
> @@ -13,6 +13,7 @@
>    * GNU GPL, version 2 or (at your option) any later version.
>    */
>   #include <stdio.h>
> +#include <stdlib.h>
>   #include <seccomp.h>
>   #include "qemu-seccomp.h"
>
> @@ -21,7 +22,7 @@ struct QemuSeccompSyscall {
>       uint8_t priority;
>   };
>
> -static const struct QemuSeccompSyscall seccomp_whitelist[] = {
> +static const struct QemuSeccompSyscall seccomp_whitelist_init[] = {
>       { SCMP_SYS(timer_settime), 255 },
>       { SCMP_SYS(timer_gettime), 254 },
>       { SCMP_SYS(futex), 253 },
> @@ -121,27 +122,107 @@ static const struct QemuSeccompSyscall seccomp_whitelist[] = {
>       { SCMP_SYS(rt_sigtimedwait), 242 }
>   };
>
> -int seccomp_start(void)
> +static const struct QemuSeccompSyscall seccomp_whitelist_main_loop[] = {
> +    { SCMP_SYS(timer_settime), 255 },
> +    { SCMP_SYS(timer_gettime), 254 },
> +    { SCMP_SYS(futex), 253 },
> +    { SCMP_SYS(select), 252 },
> +    { SCMP_SYS(recvfrom), 251 },
> +    { SCMP_SYS(sendto), 250 },
> +    { SCMP_SYS(read), 249 },
> +    { SCMP_SYS(brk), 248 },
> +    { SCMP_SYS(mmap), 247 },
> +    { SCMP_SYS(open), 247 },
> +#if defined(__i386__)
> +    { SCMP_SYS(fcntl64), 245 },
> +    { SCMP_SYS(fstat64), 245 },
> +    { SCMP_SYS(stat64), 245 },
> +    { SCMP_SYS(getgid32), 245 },
> +    { SCMP_SYS(getegid32), 245 },
> +    { SCMP_SYS(getuid32), 245 },
> +    { SCMP_SYS(geteuid32), 245 },
> +    { SCMP_SYS(sigreturn), 245 },
> +    { SCMP_SYS(_newselect), 245 },
> +    { SCMP_SYS(_llseek), 245 },
> +    { SCMP_SYS(mmap2), 245},
> +    { SCMP_SYS(sigprocmask), 245 },
> +#endif
> +    { SCMP_SYS(exit), 245 },
> +    { SCMP_SYS(timer_delete), 245 },
> +    { SCMP_SYS(exit_group), 245 },
> +    { SCMP_SYS(rt_sigreturn), 245 },
> +    { SCMP_SYS(madvise), 245 },
> +    { SCMP_SYS(write), 244 },
> +    { SCMP_SYS(fcntl), 243 },
> +    { SCMP_SYS(tgkill), 242 },
> +    { SCMP_SYS(rt_sigaction), 242 },
> +    { SCMP_SYS(pipe2), 242 },
> +    { SCMP_SYS(munmap), 242 },
> +    { SCMP_SYS(mremap), 242 },
> +    { SCMP_SYS(getsockname), 242 },
> +    { SCMP_SYS(getpeername), 242 },
> +    { SCMP_SYS(close), 242 },
> +    { SCMP_SYS(accept4), 242 },
> +    { SCMP_SYS(eventfd2), 242 },
> +    { SCMP_SYS(recvmsg), 242 },
> +    { SCMP_SYS(ioctl), 242 },
> +    { SCMP_SYS(rt_sigprocmask), 242 }
> +};
> +
> +static int
> +process_whitelist(const struct QemuSeccompSyscall *whitelist,
> +                  unsigned int size, scmp_filter_ctx *ctx)
>   {
>       int rc = 0;
> +
>       unsigned int i = 0;
> -    scmp_filter_ctx ctx;
> +
> +    for (i = 0; i < size; i++) {
> +        rc = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, whitelist[i].num, 0);
> +        if (rc < 0) {
> +            return -1;
> +        }
> +
> +        rc = seccomp_syscall_priority(ctx, whitelist[i].num,
> +                                      whitelist[i].priority);
> +        if (rc < 0) {
> +            return -1;
> +        }
> +    }
> +    return 0;
> +}
> +
> +int
> +seccomp_start(enum whitelist_mode mode, scmp_filter_ctx *ctx)
> +{
> +    int rc = 0;
>
>       ctx = seccomp_init(SCMP_ACT_KILL);

Is there any reason why ctx can't be a local variable in this function? 
  It is allocated and freed on each entry and exit in this function.

>       if (ctx == NULL) {
> +        rc = -1;
>           goto seccomp_return;
>       }
>
> -    for (i = 0; i < ARRAY_SIZE(seccomp_whitelist); i++) {
> -        rc = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, seccomp_whitelist[i].num, 0);
> -        if (rc < 0) {
> +    switch (mode) {
> +    case INIT:
> +        if (process_whitelist
> +            (seccomp_whitelist_init,
> +             ARRAY_SIZE(seccomp_whitelist_init), ctx) < 0) {
> +            rc = -1;
>               goto seccomp_return;
>           }
> -        rc = seccomp_syscall_priority(ctx, seccomp_whitelist[i].num,
> -                                      seccomp_whitelist[i].priority);
> -        if (rc < 0) {
> +        break;
> +    case MAIN_LOOP:
> +        if (process_whitelist
> +            (seccomp_whitelist_main_loop,
> +             ARRAY_SIZE(seccomp_whitelist_main_loop), ctx) < 0) {
> +            rc = -1;
>               goto seccomp_return;
>           }
> +        break;
> +    default:
> +        rc = -1;
> +        goto seccomp_return;
>       }
>
>       rc = seccomp_load(ctx);
> diff --git a/qemu-seccomp.h b/qemu-seccomp.h
> index b2fc3f8..1c97978 100644
> --- a/qemu-seccomp.h
> +++ b/qemu-seccomp.h
> @@ -18,5 +18,10 @@
>   #include <seccomp.h>
>   #include "osdep.h"
>
> -int seccomp_start(void);
> +enum whitelist_mode {
> +    INIT = 0,
> +    MAIN_LOOP = 1,
> +};
> +
> +int seccomp_start(enum whitelist_mode mode, scmp_filter_ctx *ctx);
>   #endif
> diff --git a/vl.c b/vl.c
> index bec68cd..d50018f 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -774,10 +774,11 @@ static int bt_parse(const char *opt)
>       return 1;
>   }
>
> -static int install_seccomp_filters(void)
> +static int
> +install_seccomp_filters(enum whitelist_mode mode, scmp_filter_ctx *ctx)
>   {
>   #ifdef CONFIG_SECCOMP
> -    if (seccomp_start() < 0) {
> +    if (seccomp_start(mode, ctx) < 0) {
>           qerror_report(ERROR_CLASS_GENERIC_ERROR,
>                   "failed to install seccomp syscall filter in the kernel");

I heard from Luiz Capitulino on one of my patches that qerror_report() 
is deprecated.  So you'll want to update this.

>           return -1;
> @@ -2407,6 +2408,10 @@ int main(int argc, char **argv, char **envp)
>       const char *trace_events = NULL;
>       const char *trace_file = NULL;
>
> +#ifdef CONFIG_SECCOMP
> +    scmp_filter_ctx main_loop_ctx;
> +#endif
> +
>       atexit(qemu_run_exit_notifiers);
>       error_set_progname(argv[0]);
>
> @@ -3330,11 +3335,13 @@ int main(int argc, char **argv, char **envp)
>       }
>
>       /* We should install seccomp filters even if -sandbox on is not used. */
> +#ifdef CONFIG_SECCOMP
>       if (seccomp_on) {
> -        if (install_seccomp_filters() < 0) {
> +        if (install_seccomp_filters(INIT, &main_loop_ctx) < 0) {

I don't think the variable name "main_loop_ctx" makes sense here. 
Should the name be more generic since it's used wherever a seccomp 
filter is installed?

>               exit(1);
>           }
>       }
> +#endif
>
>       if (machine == NULL) {
>           fprintf(stderr, "No machine found.\n");
> @@ -3794,6 +3801,14 @@ int main(int argc, char **argv, char **envp)
>
>       os_setup_post();
>
> +#ifdef CONFIG_SECCOMP
> +    if (seccomp_on) {
> +        if (install_seccomp_filters(MAIN_LOOP, &main_loop_ctx) < 0) {
> +            exit(1);
> +        }
> +    }
> +#endif
> +
>       resume_all_vcpus();
>       main_loop();
>       bdrv_close_all();
>
Eduardo Otubo - Oct. 24, 2012, 8:06 p.m.
On Tue, Oct 23, 2012 at 11:10:58AM -0400, Corey Bryant wrote:
> 
> 
> On 10/23/2012 01:55 AM, Eduardo Otubo wrote:
> >This patch includes a second whitelist right before the main loop. It's
> >a smaller and more restricted whitelist, excluding execve() among many
> >others.
> >
> >v2: * ctx changed to main_loop_ctx
> >     * seccomp_on now inside ifdef
> >     * open syscall added to the main_loop whitelist
> >
> >Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> >---
> >  qemu-seccomp.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
> >  qemu-seccomp.h |  7 ++++-
> >  vl.c           | 21 +++++++++++--
> >  3 files changed, 114 insertions(+), 13 deletions(-)
> >
> >diff --git a/qemu-seccomp.c b/qemu-seccomp.c
> >index a7b33e2..033cfad 100644
> >--- a/qemu-seccomp.c
> >+++ b/qemu-seccomp.c
> >@@ -13,6 +13,7 @@
> >   * GNU GPL, version 2 or (at your option) any later version.
> >   */
> >  #include <stdio.h>
> >+#include <stdlib.h>
> >  #include <seccomp.h>
> >  #include "qemu-seccomp.h"
> >
> >@@ -21,7 +22,7 @@ struct QemuSeccompSyscall {
> >      uint8_t priority;
> >  };
> >
> >-static const struct QemuSeccompSyscall seccomp_whitelist[] = {
> >+static const struct QemuSeccompSyscall seccomp_whitelist_init[] = {
> >      { SCMP_SYS(timer_settime), 255 },
> >      { SCMP_SYS(timer_gettime), 254 },
> >      { SCMP_SYS(futex), 253 },
> >@@ -121,27 +122,107 @@ static const struct QemuSeccompSyscall seccomp_whitelist[] = {
> >      { SCMP_SYS(rt_sigtimedwait), 242 }
> >  };
> >
> >-int seccomp_start(void)
> >+static const struct QemuSeccompSyscall seccomp_whitelist_main_loop[] = {
> >+    { SCMP_SYS(timer_settime), 255 },
> >+    { SCMP_SYS(timer_gettime), 254 },
> >+    { SCMP_SYS(futex), 253 },
> >+    { SCMP_SYS(select), 252 },
> >+    { SCMP_SYS(recvfrom), 251 },
> >+    { SCMP_SYS(sendto), 250 },
> >+    { SCMP_SYS(read), 249 },
> >+    { SCMP_SYS(brk), 248 },
> >+    { SCMP_SYS(mmap), 247 },
> >+    { SCMP_SYS(open), 247 },
> >+#if defined(__i386__)
> >+    { SCMP_SYS(fcntl64), 245 },
> >+    { SCMP_SYS(fstat64), 245 },
> >+    { SCMP_SYS(stat64), 245 },
> >+    { SCMP_SYS(getgid32), 245 },
> >+    { SCMP_SYS(getegid32), 245 },
> >+    { SCMP_SYS(getuid32), 245 },
> >+    { SCMP_SYS(geteuid32), 245 },
> >+    { SCMP_SYS(sigreturn), 245 },
> >+    { SCMP_SYS(_newselect), 245 },
> >+    { SCMP_SYS(_llseek), 245 },
> >+    { SCMP_SYS(mmap2), 245},
> >+    { SCMP_SYS(sigprocmask), 245 },
> >+#endif
> >+    { SCMP_SYS(exit), 245 },
> >+    { SCMP_SYS(timer_delete), 245 },
> >+    { SCMP_SYS(exit_group), 245 },
> >+    { SCMP_SYS(rt_sigreturn), 245 },
> >+    { SCMP_SYS(madvise), 245 },
> >+    { SCMP_SYS(write), 244 },
> >+    { SCMP_SYS(fcntl), 243 },
> >+    { SCMP_SYS(tgkill), 242 },
> >+    { SCMP_SYS(rt_sigaction), 242 },
> >+    { SCMP_SYS(pipe2), 242 },
> >+    { SCMP_SYS(munmap), 242 },
> >+    { SCMP_SYS(mremap), 242 },
> >+    { SCMP_SYS(getsockname), 242 },
> >+    { SCMP_SYS(getpeername), 242 },
> >+    { SCMP_SYS(close), 242 },
> >+    { SCMP_SYS(accept4), 242 },
> >+    { SCMP_SYS(eventfd2), 242 },
> >+    { SCMP_SYS(recvmsg), 242 },
> >+    { SCMP_SYS(ioctl), 242 },
> >+    { SCMP_SYS(rt_sigprocmask), 242 }
> >+};
> >+
> >+static int
> >+process_whitelist(const struct QemuSeccompSyscall *whitelist,
> >+                  unsigned int size, scmp_filter_ctx *ctx)
> >  {
> >      int rc = 0;
> >+
> >      unsigned int i = 0;
> >-    scmp_filter_ctx ctx;
> >+
> >+    for (i = 0; i < size; i++) {
> >+        rc = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, whitelist[i].num, 0);
> >+        if (rc < 0) {
> >+            return -1;
> >+        }
> >+
> >+        rc = seccomp_syscall_priority(ctx, whitelist[i].num,
> >+                                      whitelist[i].priority);
> >+        if (rc < 0) {
> >+            return -1;
> >+        }
> >+    }
> >+    return 0;
> >+}
> >+
> >+int
> >+seccomp_start(enum whitelist_mode mode, scmp_filter_ctx *ctx)
> >+{
> >+    int rc = 0;
> >
> >      ctx = seccomp_init(SCMP_ACT_KILL);
> 
> Is there any reason why ctx can't be a local variable in this
> function?  It is allocated and freed on each entry and exit in this
> function.

I think you're probably right. I'll declare this variable as local in the
next version.

> 
> >      if (ctx == NULL) {
> >+        rc = -1;
> >          goto seccomp_return;
> >      }
> >
> >-    for (i = 0; i < ARRAY_SIZE(seccomp_whitelist); i++) {
> >-        rc = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, seccomp_whitelist[i].num, 0);
> >-        if (rc < 0) {
> >+    switch (mode) {
> >+    case INIT:
> >+        if (process_whitelist
> >+            (seccomp_whitelist_init,
> >+             ARRAY_SIZE(seccomp_whitelist_init), ctx) < 0) {
> >+            rc = -1;
> >              goto seccomp_return;
> >          }
> >-        rc = seccomp_syscall_priority(ctx, seccomp_whitelist[i].num,
> >-                                      seccomp_whitelist[i].priority);
> >-        if (rc < 0) {
> >+        break;
> >+    case MAIN_LOOP:
> >+        if (process_whitelist
> >+            (seccomp_whitelist_main_loop,
> >+             ARRAY_SIZE(seccomp_whitelist_main_loop), ctx) < 0) {
> >+            rc = -1;
> >              goto seccomp_return;
> >          }
> >+        break;
> >+    default:
> >+        rc = -1;
> >+        goto seccomp_return;
> >      }
> >
> >      rc = seccomp_load(ctx);
> >diff --git a/qemu-seccomp.h b/qemu-seccomp.h
> >index b2fc3f8..1c97978 100644
> >--- a/qemu-seccomp.h
> >+++ b/qemu-seccomp.h
> >@@ -18,5 +18,10 @@
> >  #include <seccomp.h>
> >  #include "osdep.h"
> >
> >-int seccomp_start(void);
> >+enum whitelist_mode {
> >+    INIT = 0,
> >+    MAIN_LOOP = 1,
> >+};
> >+
> >+int seccomp_start(enum whitelist_mode mode, scmp_filter_ctx *ctx);
> >  #endif
> >diff --git a/vl.c b/vl.c
> >index bec68cd..d50018f 100644
> >--- a/vl.c
> >+++ b/vl.c
> >@@ -774,10 +774,11 @@ static int bt_parse(const char *opt)
> >      return 1;
> >  }
> >
> >-static int install_seccomp_filters(void)
> >+static int
> >+install_seccomp_filters(enum whitelist_mode mode, scmp_filter_ctx *ctx)
> >  {
> >  #ifdef CONFIG_SECCOMP
> >-    if (seccomp_start() < 0) {
> >+    if (seccomp_start(mode, ctx) < 0) {
> >          qerror_report(ERROR_CLASS_GENERIC_ERROR,
> >                  "failed to install seccomp syscall filter in the kernel");
> 
> I heard from Luiz Capitulino on one of my patches that
> qerror_report() is deprecated.  So you'll want to update this.
> 
> >          return -1;
> >@@ -2407,6 +2408,10 @@ int main(int argc, char **argv, char **envp)
> >      const char *trace_events = NULL;
> >      const char *trace_file = NULL;
> >
> >+#ifdef CONFIG_SECCOMP
> >+    scmp_filter_ctx main_loop_ctx;
> >+#endif
> >+
> >      atexit(qemu_run_exit_notifiers);
> >      error_set_progname(argv[0]);
> >
> >@@ -3330,11 +3335,13 @@ int main(int argc, char **argv, char **envp)
> >      }
> >
> >      /* We should install seccomp filters even if -sandbox on is not used. */
> >+#ifdef CONFIG_SECCOMP
> >      if (seccomp_on) {
> >-        if (install_seccomp_filters() < 0) {
> >+        if (install_seccomp_filters(INIT, &main_loop_ctx) < 0) {
> 
> I don't think the variable name "main_loop_ctx" makes sense here.
> Should the name be more generic since it's used wherever a seccomp
> filter is installed?

I removed this variable since I'm using a local variable as I said
above.

Thanks for the comments :)
Eduardo Otubo - Oct. 25, 2012, 8:16 p.m.
On Tue, Oct 23, 2012 at 11:10:58AM -0400, Corey Bryant wrote:
> 
> 
> On 10/23/2012 01:55 AM, Eduardo Otubo wrote:
> >This patch includes a second whitelist right before the main loop. It's
> >a smaller and more restricted whitelist, excluding execve() among many
> >others.
> >
> >v2: * ctx changed to main_loop_ctx
> >     * seccomp_on now inside ifdef
> >     * open syscall added to the main_loop whitelist
> >
> >Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> >---
> >  qemu-seccomp.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
> >  qemu-seccomp.h |  7 ++++-
> >  vl.c           | 21 +++++++++++--
> >  3 files changed, 114 insertions(+), 13 deletions(-)
> >
> >diff --git a/qemu-seccomp.c b/qemu-seccomp.c
> >index a7b33e2..033cfad 100644
> >--- a/qemu-seccomp.c
> >+++ b/qemu-seccomp.c
> >@@ -13,6 +13,7 @@
> >   * GNU GPL, version 2 or (at your option) any later version.
> >   */
> >  #include <stdio.h>
> >+#include <stdlib.h>
> >  #include <seccomp.h>
> >  #include "qemu-seccomp.h"
> >
> >@@ -21,7 +22,7 @@ struct QemuSeccompSyscall {
> >      uint8_t priority;
> >  };
> >
> >-static const struct QemuSeccompSyscall seccomp_whitelist[] = {
> >+static const struct QemuSeccompSyscall seccomp_whitelist_init[] = {
> >      { SCMP_SYS(timer_settime), 255 },
> >      { SCMP_SYS(timer_gettime), 254 },
> >      { SCMP_SYS(futex), 253 },
> >@@ -121,27 +122,107 @@ static const struct QemuSeccompSyscall seccomp_whitelist[] = {
> >      { SCMP_SYS(rt_sigtimedwait), 242 }
> >  };
> >
> >-int seccomp_start(void)
> >+static const struct QemuSeccompSyscall seccomp_whitelist_main_loop[] = {
> >+    { SCMP_SYS(timer_settime), 255 },
> >+    { SCMP_SYS(timer_gettime), 254 },
> >+    { SCMP_SYS(futex), 253 },
> >+    { SCMP_SYS(select), 252 },
> >+    { SCMP_SYS(recvfrom), 251 },
> >+    { SCMP_SYS(sendto), 250 },
> >+    { SCMP_SYS(read), 249 },
> >+    { SCMP_SYS(brk), 248 },
> >+    { SCMP_SYS(mmap), 247 },
> >+    { SCMP_SYS(open), 247 },
> >+#if defined(__i386__)
> >+    { SCMP_SYS(fcntl64), 245 },
> >+    { SCMP_SYS(fstat64), 245 },
> >+    { SCMP_SYS(stat64), 245 },
> >+    { SCMP_SYS(getgid32), 245 },
> >+    { SCMP_SYS(getegid32), 245 },
> >+    { SCMP_SYS(getuid32), 245 },
> >+    { SCMP_SYS(geteuid32), 245 },
> >+    { SCMP_SYS(sigreturn), 245 },
> >+    { SCMP_SYS(_newselect), 245 },
> >+    { SCMP_SYS(_llseek), 245 },
> >+    { SCMP_SYS(mmap2), 245},
> >+    { SCMP_SYS(sigprocmask), 245 },
> >+#endif
> >+    { SCMP_SYS(exit), 245 },
> >+    { SCMP_SYS(timer_delete), 245 },
> >+    { SCMP_SYS(exit_group), 245 },
> >+    { SCMP_SYS(rt_sigreturn), 245 },
> >+    { SCMP_SYS(madvise), 245 },
> >+    { SCMP_SYS(write), 244 },
> >+    { SCMP_SYS(fcntl), 243 },
> >+    { SCMP_SYS(tgkill), 242 },
> >+    { SCMP_SYS(rt_sigaction), 242 },
> >+    { SCMP_SYS(pipe2), 242 },
> >+    { SCMP_SYS(munmap), 242 },
> >+    { SCMP_SYS(mremap), 242 },
> >+    { SCMP_SYS(getsockname), 242 },
> >+    { SCMP_SYS(getpeername), 242 },
> >+    { SCMP_SYS(close), 242 },
> >+    { SCMP_SYS(accept4), 242 },
> >+    { SCMP_SYS(eventfd2), 242 },
> >+    { SCMP_SYS(recvmsg), 242 },
> >+    { SCMP_SYS(ioctl), 242 },
> >+    { SCMP_SYS(rt_sigprocmask), 242 }
> >+};
> >+
> >+static int
> >+process_whitelist(const struct QemuSeccompSyscall *whitelist,
> >+                  unsigned int size, scmp_filter_ctx *ctx)
> >  {
> >      int rc = 0;
> >+
> >      unsigned int i = 0;
> >-    scmp_filter_ctx ctx;
> >+
> >+    for (i = 0; i < size; i++) {
> >+        rc = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, whitelist[i].num, 0);
> >+        if (rc < 0) {
> >+            return -1;
> >+        }
> >+
> >+        rc = seccomp_syscall_priority(ctx, whitelist[i].num,
> >+                                      whitelist[i].priority);
> >+        if (rc < 0) {
> >+            return -1;
> >+        }
> >+    }
> >+    return 0;
> >+}
> >+
> >+int
> >+seccomp_start(enum whitelist_mode mode, scmp_filter_ctx *ctx)
> >+{
> >+    int rc = 0;
> >
> >      ctx = seccomp_init(SCMP_ACT_KILL);
> 
> Is there any reason why ctx can't be a local variable in this
> function?  It is allocated and freed on each entry and exit in this
> function.
> 

It does make sense. I'll make it local in the next version, thanks.

> >      if (ctx == NULL) {
> >+        rc = -1;
> >          goto seccomp_return;
> >      }
> >
> >-    for (i = 0; i < ARRAY_SIZE(seccomp_whitelist); i++) {
> >-        rc = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, seccomp_whitelist[i].num, 0);
> >-        if (rc < 0) {
> >+    switch (mode) {
> >+    case INIT:
> >+        if (process_whitelist
> >+            (seccomp_whitelist_init,
> >+             ARRAY_SIZE(seccomp_whitelist_init), ctx) < 0) {
> >+            rc = -1;
> >              goto seccomp_return;
> >          }
> >-        rc = seccomp_syscall_priority(ctx, seccomp_whitelist[i].num,
> >-                                      seccomp_whitelist[i].priority);
> >-        if (rc < 0) {
> >+        break;
> >+    case MAIN_LOOP:
> >+        if (process_whitelist
> >+            (seccomp_whitelist_main_loop,
> >+             ARRAY_SIZE(seccomp_whitelist_main_loop), ctx) < 0) {
> >+            rc = -1;
> >              goto seccomp_return;
> >          }
> >+        break;
> >+    default:
> >+        rc = -1;
> >+        goto seccomp_return;
> >      }
> >
> >      rc = seccomp_load(ctx);
> >diff --git a/qemu-seccomp.h b/qemu-seccomp.h
> >index b2fc3f8..1c97978 100644
> >--- a/qemu-seccomp.h
> >+++ b/qemu-seccomp.h
> >@@ -18,5 +18,10 @@
> >  #include <seccomp.h>
> >  #include "osdep.h"
> >
> >-int seccomp_start(void);
> >+enum whitelist_mode {
> >+    INIT = 0,
> >+    MAIN_LOOP = 1,
> >+};
> >+
> >+int seccomp_start(enum whitelist_mode mode, scmp_filter_ctx *ctx);
> >  #endif
> >diff --git a/vl.c b/vl.c
> >index bec68cd..d50018f 100644
> >--- a/vl.c
> >+++ b/vl.c
> >@@ -774,10 +774,11 @@ static int bt_parse(const char *opt)
> >      return 1;
> >  }
> >
> >-static int install_seccomp_filters(void)
> >+static int
> >+install_seccomp_filters(enum whitelist_mode mode, scmp_filter_ctx *ctx)
> >  {
> >  #ifdef CONFIG_SECCOMP
> >-    if (seccomp_start() < 0) {
> >+    if (seccomp_start(mode, ctx) < 0) {
> >          qerror_report(ERROR_CLASS_GENERIC_ERROR,
> >                  "failed to install seccomp syscall filter in the kernel");
> 
> I heard from Luiz Capitulino on one of my patches that
> qerror_report() is deprecated.  So you'll want to update this.

Luiz Capitulino commented on IRC that fprintf(stderr) is fine for
vl.c. I'll use that.

> 
> >          return -1;
> >@@ -2407,6 +2408,10 @@ int main(int argc, char **argv, char **envp)
> >      const char *trace_events = NULL;
> >      const char *trace_file = NULL;
> >
> >+#ifdef CONFIG_SECCOMP
> >+    scmp_filter_ctx main_loop_ctx;
> >+#endif
> >+
> >      atexit(qemu_run_exit_notifiers);
> >      error_set_progname(argv[0]);
> >
> >@@ -3330,11 +3335,13 @@ int main(int argc, char **argv, char **envp)
> >      }
> >
> >      /* We should install seccomp filters even if -sandbox on is not used. */
> >+#ifdef CONFIG_SECCOMP
> >      if (seccomp_on) {
> >-        if (install_seccomp_filters() < 0) {
> >+        if (install_seccomp_filters(INIT, &main_loop_ctx) < 0) {
> 
> I don't think the variable name "main_loop_ctx" makes sense here.
> Should the name be more generic since it's used wherever a seccomp
> filter is installed?

Since the ctx variable is now local to the function
install_seccomp_filters(), I'll remove this reference in the next
version.

Thanks for all the comments :)
Paul Moore - Nov. 2, 2012, 9:29 p.m.
On Tuesday, October 23, 2012 03:55:31 AM Eduardo Otubo wrote:
> This patch includes a second whitelist right before the main loop. It's
> a smaller and more restricted whitelist, excluding execve() among many
> others.
> 
> v2: * ctx changed to main_loop_ctx
>     * seccomp_on now inside ifdef
>     * open syscall added to the main_loop whitelist
> 
> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>

Unfortunately qemu.org seems to be down for me today so I can't grab the 
latest repo to review/verify this patch (some of my comments/assumptions below 
may be off) but I'm a little confused, hopefully you guys can help me out, 
read below ...

The first call to seccomp_install_filter() will set up a whitelist for the 
syscalls that have been explicitly specified, all others will hit the default 
action TRAP/KILL.  The second call to seccomp_install_filter() will add a 
second whitelist for another set of explicitly specified syscalls, all others 
will hit the default action TRAP/KILL.

The problem occurs when the filters are executed in the kernel when a syscall 
is executed.  On each syscall the first filter will be executed and the action 
will either be ALLOW or TRAP/KILL, next the second filter will be executed and 
the action will either be ALLOW or TRAP/KILL; since the kernel always takes 
the most restrictive (lowest integer action value) action when multiple 
filters are specified, I think your double whitelist value is going to have 
some inherent problems.  I might suggest an initial, fairly permissive 
whitelist followed by a follow-on blacklist if you want to disable certain 
syscalls.
Corey Bryant - Nov. 2, 2012, 10 p.m.
On 11/02/2012 05:29 PM, Paul Moore wrote:
> On Tuesday, October 23, 2012 03:55:31 AM Eduardo Otubo wrote:
>> This patch includes a second whitelist right before the main loop. It's
>> a smaller and more restricted whitelist, excluding execve() among many
>> others.
>>
>> v2: * ctx changed to main_loop_ctx
>>      * seccomp_on now inside ifdef
>>      * open syscall added to the main_loop whitelist
>>
>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>
> Unfortunately qemu.org seems to be down for me today so I can't grab the
> latest repo to review/verify this patch (some of my comments/assumptions below
> may be off) but I'm a little confused, hopefully you guys can help me out,
> read below ...
>
> The first call to seccomp_install_filter() will setup a whitelist for the
> syscalls that have been explicitly specified, all others will hit the default
> action TRAP/KILL.  The second call to seccomp_install_filter() will add a
> second whitelist for another set of explicitly specified syscalls, all others
> will hit the default action TRAP/KILL.

That's correct.  The goal was to have a 2nd list that is a subset of the 
1st list, and also not include execve() in the 2nd list.  At this point 
though, since it's late in the release, we've expanded the 2nd list to 
be the same as the 1st with the exception of execve() not being in the 
2nd list.

>
> The problem occurs when the filters are executed in the kernel when a syscall
> is executed.  On each syscall the first filter will be executed and the action
> will either be ALLOW or TRAP/KILL, next the second filter will be executed and
> the action will either be ALLOW or TRAP/KILL; since the kernel always takes
> the most restrictive (lowest integer action value) action when multiple
> filters are specified, I think your double whitelist value is going to have
> some inherent problems.

That's something I hadn't thought of.  But TRAP and KILL won't exist 
together in our whitelists, and our 2nd whitelist is a subset of the 
1st.  So do you think there would still be problems?

> I might suggest an initial, fairly permissive
> whitelist followed by a follow-on blacklist if you want to disable certain
> syscalls.
>

I have to admit I'm nervous about this at this point in QEMU 1.3.  It's 
getting late in the cycle and we'd hoped to get this in earlier.  A more 
permissive whitelist is probably going to be the only way we'll 
successfully turn -sandbox on by default at this point in QEMU 1.3.
Anthony Liguori - Nov. 2, 2012, 10:01 p.m.
Paul Moore <pmoore@redhat.com> writes:

> On Tuesday, October 23, 2012 03:55:31 AM Eduardo Otubo wrote:
>> This patch includes a second whitelist right before the main loop. It's
>> a smaller and more restricted whitelist, excluding execve() among many
>> others.
>> 
>> v2: * ctx changed to main_loop_ctx
>>     * seccomp_on now inside ifdef
>>     * open syscall added to the main_loop whitelist
>> 
>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>
> Unfortunately qemu.org seems to be down for me today so I can't grab
> the 

qemu.org is up, just having DNS problems.  Use git.qemu-project.org
instead and you should be fine.

Regards,

Anthony Liguori

> latest repo to review/verify this patch (some of my comments/assumptions below 
> may be off) but I'm a little confused, hopefully you guys can help me out, 
> read below ...
>
> The first call to seccomp_install_filter() will setup a whitelist for the 
> syscalls that have been explicitly specified, all others will hit the default 
> action TRAP/KILL.  The second call to seccomp_install_filter() will add a 
> second whitelist for another set of explicitly specified syscalls, all others 
> will hit the default action TRAP/KILL.
>
> The problem occurs when the filters are executed in the kernel when a syscall 
> is executed.  On each syscall the first filter will be executed and the action 
> will either be ALLOW or TRAP/KILL, next the second filter will be executed and 
> the action will either be ALLOW or TRAP/KILL; since the kernel always takes 
> the most restrictive (lowest integer action value) action when multiple 
> filters are specified, I think your double whitelist value is going to have 
> some inherent problems.  I might suggest an initial, fairly permissive 
> whitelist followed by a follow-on blacklist if you want to disable certain 
> syscalls.
>
> -- 
> paul moore
> security and virtualization @ redhat
Paul Moore - Nov. 2, 2012, 10:14 p.m.
On Friday, November 02, 2012 06:00:29 PM Corey Bryant wrote:
> On 11/02/2012 05:29 PM, Paul Moore wrote:
> > On Tuesday, October 23, 2012 03:55:31 AM Eduardo Otubo wrote:
> >> This patch includes a second whitelist right before the main loop. It's
> >> a smaller and more restricted whitelist, excluding execve() among many
> >> others.
> >> 
> >> v2: * ctx changed to main_loop_ctx
> >> 
> >>      * seccomp_on now inside ifdef
> >>      * open syscall added to the main_loop whitelist
> >> 
> >> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> > 
> > Unfortunately qemu.org seems to be down for me today so I can't grab the
> > latest repo to review/verify this patch (some of my comments/assumptions
> > below may be off) but I'm a little confused, hopefully you guys can help
> > me out, read below ...
> > 
> > The first call to seccomp_install_filter() will setup a whitelist for the
> > syscalls that have been explicitly specified, all others will hit the
> > default action TRAP/KILL.  The second call to seccomp_install_filter()
> > will add a second whitelist for another set of explicitly specified
> > syscalls, all others will hit the default action TRAP/KILL.
> 
> That's correct.  The goal was to have a 2nd list that is a subset of the
> 1st list, and also not include execve() in the 2nd list.  At this point
> though, since it's late in the release, we've expanded the 2nd list to
> be the same as the 1st with the exception of execve() not being in the
> 2nd list.
> 
> > The problem occurs when the filters are executed in the kernel when a
> > syscall is executed.  On each syscall the first filter will be executed
> > and the action will either be ALLOW or TRAP/KILL, next the second filter
> > will be executed and the action will either be ALLOW or TRAP/KILL; since
> > the kernel always takes the most restrictive (lowest integer action
> > value) action when multiple filters are specified, I think your double
> > whitelist value is going to have some inherent problems.
> 
> That's something I hadn't thought of.  But TRAP and KILL won't exist
> together in our whitelists, and our 2nd whitelist is a subset of the
> 1st.  So do you think there would still be problems?

It doesn't really matter if the default action is TRAP and/or KILL, the point 
is that if you use a second whitelist after an initial whitelist the effective 
seccomp filter is going to be only the syscalls you explicitly allowed in the 
second whitelist.  When using multiple seccomp filters on a process, all 
filters are executed for each syscall and the most restrictive action of all 
the filters is the action that the kernel takes.

Don't get me wrong, I like the idea of progressively restricting QEMU, but if 
you are going to load multiple seccomp filters into the kernel, you almost 
certainly only want the first whitelist filter to be the union of all the 
seccomp filters you intend to load, with all subsequent filters being blacklists 
which progressively remove syscalls which are allowed by the initial 
whitelist.

> > I might suggest an initial, fairly permissive
> > whitelist followed by a follow-on blacklist if you want to disable certain
> > syscalls.
> 
> I have to admit I'm nervous about this at this point in QEMU 1.3.  It's
> getting late in the cycle and we'd hoped to get this in earlier.  A more
> permissive whitelist is probably going to be the only way we'll
> successfully turn -sandbox on by default at this point in QEMU 1.3.

That's fine, I just wanted to point out that I think the multiple whitelist 
approach is going to have some inherent problems.
Corey Bryant - Nov. 5, 2012, 2:39 p.m.
On 11/02/2012 06:14 PM, Paul Moore wrote:
> On Friday, November 02, 2012 06:00:29 PM Corey Bryant wrote:
>> On 11/02/2012 05:29 PM, Paul Moore wrote:
>>> On Tuesday, October 23, 2012 03:55:31 AM Eduardo Otubo wrote:
>>>> This patch includes a second whitelist right before the main loop. It's
>>>> a smaller and more restricted whitelist, excluding execve() among many
>>>> others.
>>>>
>>>> v2: * ctx changed to main_loop_ctx
>>>>
>>>>       * seccomp_on now inside ifdef
>>>>       * open syscall added to the main_loop whitelist
>>>>
>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>>>
>>> Unfortunately qemu.org seems to be down for me today so I can't grab the
>>> latest repo to review/verify this patch (some of my comments/assumptions
>>> below may be off) but I'm a little confused, hopefully you guys can help
>>> me out, read below ...
>>>
>>> The first call to seccomp_install_filter() will setup a whitelist for the
>>> syscalls that have been explicitly specified, all others will hit the
>>> default action TRAP/KILL.  The second call to seccomp_install_filter()
>>> will add a second whitelist for another set of explicitly specified
>>> syscalls, all others will hit the default action TRAP/KILL.
>>
>> That's correct.  The goal was to have a 2nd list that is a subset of the
>> 1st list, and also not include execve() in the 2nd list.  At this point
>> though, since it's late in the release, we've expanded the 2nd list to
>> be the same as the 1st with the exception of execve() not being in the
>> 2nd list.
>>
>>> The problem occurs when the filters are executed in the kernel when a
>>> syscall is executed.  On each syscall the first filter will be executed
>>> and the action will either be ALLOW or TRAP/KILL, next the second filter
>>> will be executed and the action will either be ALLOW or TRAP/KILL; since
>>> the kernel always takes the most restrictive (lowest integer action
>>> value) action when multiple filters are specified, I think your double
>>> whitelist value is going to have some inherent problems.
>>
>> That's something I hadn't thought of.  But TRAP and KILL won't exist
>> together in our whitelists, and our 2nd whitelist is a subset of the
>> 1st.  So do you think there would still be problems?
>
> It doesn't really matter if the default action is TRAP and/or KILL, the point
> is that if you use a second whitelist after an initial whitelist the effective
> seccomp filter is going to be only the syscalls you explicitly allowed in the
> second whitelist.  When using multiple seccomp filters on a process, all
> filters are executed for each syscall and the most restrictive action of all
> the filters is the action that the kernel takes.
>
> Don't get me wrong, I like the idea of progressively restricting QEMU, but if
> you are going to load multiple seccomp filters into the kernel, you almost
> certainly only want the first whitelist filter to be the union of all the
> seccomp filter you intend to load with all subsequent filters being blacklists
> which progressively remove syscalls which are allowed by the initial
> whitelist.
>

That's what we're doing though.  The first whitelist is a union of all 
subsequent filters.  Of course there's only one subsequent filter at 
this point.  But the idea is to start out with a large whitelist for 
initialization and then tighten it up before the main loop when 
presumably fewer syscalls are needed.

My concern is getting the two whitelists correct.  We keep uncovering 
new syscalls as we test.

>>> I might suggest an initial, fairly permissive
>>> whitelist followed by a follow-on blacklist if you want to disable certain
>>> syscalls.
>>
>> I have to admit I'm nervous about this at this point in QEMU 1.3.  It's
>> getting late in the cycle and we'd hoped to get this in earlier.  A more
>> permissive whitelist is probably going to be the only way we'll
>> successfully turn -sandbox on by default at this point in QEMU 1.3.
>
> Thats fine, I just wanted to point out that I think the multiple whitelist
> approach is going to have some inherent problems.
>

Are you thinking there will be problems with the current two-whitelist 
approach, or are you thinking there would be problems in the future if 
we continued restricting the QEMU process with further whitelists?  If 
you mean the latter, then I understand your point since QEMU is a single 
process that requires a certain subset of syscalls.

I'm thinking once the two whitelists are in place, we can move on to 
restricting syscall parameters in the existing whitelists where it makes 
sense, and then look into your original decomposition approach, where 
parts of qemu are run in separate threads/processes which would allow 
much tighter seccomp restriction.

What do you think?
Paul Moore - Nov. 5, 2012, 9:58 p.m.
On Monday, November 05, 2012 09:39:46 AM Corey Bryant wrote:
> On 11/02/2012 06:14 PM, Paul Moore wrote:
> > On Friday, November 02, 2012 06:00:29 PM Corey Bryant wrote:
> >> On 11/02/2012 05:29 PM, Paul Moore wrote:
> >>> On Tuesday, October 23, 2012 03:55:31 AM Eduardo Otubo wrote:
> >>>> This patch includes a second whitelist right before the main loop. It's
> >>>> a smaller and more restricted whitelist, excluding execve() among many
> >>>> others.
> >>>> 
> >>>> v2: * ctx changed to main_loop_ctx
> >>>> 
> >>>>       * seccomp_on now inside ifdef
> >>>>       * open syscall added to the main_loop whitelist
> >>>> 
> >>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> >>> 
> >>> Unfortunately qemu.org seems to be down for me today so I can't grab the
> >>> latest repo to review/verify this patch (some of my comments/assumptions
> >>> below may be off) but I'm a little confused, hopefully you guys can help
> >>> me out, read below ...
> >>> 
> >>> The first call to seccomp_install_filter() will setup a whitelist for
> >>> the
> >>> syscalls that have been explicitly specified, all others will hit the
> >>> default action TRAP/KILL.  The second call to seccomp_install_filter()
> >>> will add a second whitelist for another set of explicitly specified
> >>> syscalls, all others will hit the default action TRAP/KILL.
> >> 
> >> That's correct.  The goal was to have a 2nd list that is a subset of the
> >> 1st list, and also not include execve() in the 2nd list.  At this point
> >> though, since it's late in the release, we've expanded the 2nd list to
> >> be the same as the 1st with the exception of execve() not being in the
> >> 2nd list.
> >> 
> >>> The problem occurs when the filters are executed in the kernel when a
> >>> syscall is executed.  On each syscall the first filter will be executed
> >>> and the action will either be ALLOW or TRAP/KILL, next the second filter
> >>> will be executed and the action will either be ALLOW or TRAP/KILL; since
> >>> the kernel always takes the most restrictive (lowest integer action
> >>> value) action when multiple filters are specified, I think your double
> >>> whitelist value is going to have some inherent problems.
> >> 
> >> That's something I hadn't thought of.  But TRAP and KILL won't exist
> >> together in our whitelists, and our 2nd whitelist is a subset of the
> >> 1st.  So do you think there would still be problems?
> > 
> > It doesn't really matter if the default action is TRAP and/or KILL, the
> > point is that if you use a second whitelist after an initial whitelist
> > the effective seccomp filter is going to be only the syscalls you
> > explicitly allowed in the second whitelist.  When using multiple seccomp
> > filters on a process, all filters are executed for each syscall and the
> > most restrictive action of all the filters is the action that the kernel
> > takes.
> > 
> > Don't get me wrong, I like the idea of progressively restricting QEMU, but
> > if you are going to load multiple seccomp filters into the kernel, you
> > almost certainly only want the first whitelist filter to be the union of
> > all the seccomp filter you intend to load with all subsequent filters
> > being blacklists which progressively remove syscalls which are allowed by
> > the initial whitelist.
> 
> That's what we're doing though.  The first whitelist is a union of all
> subsequent filters.  Of course there's only one subsequent filter at
> this point.  But the idea is to start out with a large whitelist for
> initialization and then tighten it up before the main loop when
> presumably less syscalls are needed.

Okay, that's good ... It still seems a bit odd to me; I think a whitelist 
first, blacklist second approach is a more intuitive and efficient solution, 
but that may just be me.

> My concern is getting the two whitelists correct.  We keep uncovering
> new syscalls as we test.

Of course, this whole whitelist/blacklist discussion assumes the list of 
allowed syscalls is correct.

> >>> I might suggest an initial, fairly permissive
> >>> whitelist followed by a follow-on blacklist if you want to disable
> >>> certain
> >>> syscalls.
> >> 
> >> I have to admit I'm nervous about this at this point in QEMU 1.3.  It's
> >> getting late in the cycle and we'd hoped to get this in earlier.  A more
> >> permissive whitelist is probably going to be the only way we'll
> >> successfully turn -sandbox on by default at this point in QEMU 1.3.
> > 
> > Thats fine, I just wanted to point out that I think the multiple whitelist
> > approach is going to have some inherent problems.
> 
> Are you thinking there will be problems with the current two-whitelist
> approach, or are you thinking there would be problems in the future if
> we continued restricting the QEMU process with further whitelists?  If
> you mean the latter, then I understand your point since QEMU is a single
> process that requires a certain subset of syscalls.

I was originally concerned that you were structuring the whitelists 
incorrectly, but it sounds like that is not the case - that's good.

I'm still concerned that the double whitelist approach may result in bigger 
syscall filters than necessary but until we get a final-ish list there is no 
point worrying about that.

> I'm thinking once the two whitelists are in place, we can move on to
> restricting syscall parameters in the existing whitelists where it makes
> sense ...

Yep, sounds reasonable.

> and then look into your original decomposition approach, where
> parts of qemu are run in separate threads/processes which would allow
> much tighter seccomp restriction.

Ultimately I think this is the right solution if we want to get serious about 
making QEMU more resistant to attacks from malicious guests.
Corey Bryant - Nov. 5, 2012, 10:26 p.m.
On 11/05/2012 04:58 PM, Paul Moore wrote:
> On Monday, November 05, 2012 09:39:46 AM Corey Bryant wrote:
>> On 11/02/2012 06:14 PM, Paul Moore wrote:
>>> On Friday, November 02, 2012 06:00:29 PM Corey Bryant wrote:
>>>> On 11/02/2012 05:29 PM, Paul Moore wrote:
>>>>> On Tuesday, October 23, 2012 03:55:31 AM Eduardo Otubo wrote:
>>>>>> This patch includes a second whitelist right before the main loop. It's
>>>>>> a smaller and more restricted whitelist, excluding execve() among many
>>>>>> others.
>>>>>>
>>>>>> v2: * ctx changed to main_loop_ctx
>>>>>>
>>>>>>        * seccomp_on now inside ifdef
>>>>>>        * open syscall added to the main_loop whitelist
>>>>>>
>>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>>>>>
>>>>> Unfortunately qemu.org seems to be down for me today so I can't grab the
>>>>> latest repo to review/verify this patch (some of my comments/assumptions
>>>>> below may be off) but I'm a little confused, hopefully you guys can help
>>>>> me out, read below ...
>>>>>
>>>>> The first call to seccomp_install_filter() will setup a whitelist for
>>>>> the
>>>>> syscalls that have been explicitly specified, all others will hit the
>>>>> default action TRAP/KILL.  The second call to seccomp_install_filter()
>>>>> will add a second whitelist for another set of explicitly specified
>>>>> syscalls, all others will hit the default action TRAP/KILL.
>>>>
>>>> That's correct.  The goal was to have a 2nd list that is a subset of the
>>>> 1st list, and also not include execve() in the 2nd list.  At this point
>>>> though, since it's late in the release, we've expanded the 2nd list to
>>>> be the same as the 1st with the exception of execve() not being in the
>>>> 2nd list.
>>>>
>>>>> The problem occurs when the filters are executed in the kernel when a
>>>>> syscall is executed.  On each syscall the first filter will be executed
>>>>> and the action will either be ALLOW or TRAP/KILL, next the second filter
>>>>> will be executed and the action will either be ALLOW or TRAP/KILL; since
>>>>> the kernel always takes the most restrictive (lowest integer action
>>>>> value) action when multiple filters are specified, I think your double
>>>>> whitelist value is going to have some inherent problems.
>>>>
>>>> That's something I hadn't thought of.  But TRAP and KILL won't exist
>>>> together in our whitelists, and our 2nd whitelist is a subset of the
>>>> 1st.  So do you think there would still be problems?
>>>
>>> It doesn't really matter if the default action is TRAP and/or KILL, the
>>> point is that if you use a second whitelist after an initial whitelist
>>> the effective seccomp filter is going to be only the syscalls you
>>> explicitly allowed in the second whitelist.  When using multiple seccomp
>>> filters on a process, all filters are executed for each syscall and the
>>> most restrictive action of all the filters is the action that the kernel
>>> takes.
>>>
>>> Don't get me wrong, I like the idea of progressively restricting QEMU, but
>>> if you are going to load multiple seccomp filters into the kernel, you
>>> almost certainly only want the first whitelist filter to be the union of
>>> all the seccomp filter you intend to load with all subsequent filters
>>> being blacklists which progressively remove syscalls which are allowed by
>>> the initial whitelist.
>>
>> That's what we're doing though.  The first whitelist is a union of all
>> subsequent filters.  Of course there's only one subsequent filter at
>> this point.  But the idea is to start out with a large whitelist for
>> initialization and then tighten it up before the main loop when
>> presumably less syscalls are needed.
>
> Okay, that's good ... It still seems a bit odd to me, I think a whitelist 1st
> blacklist 2nd is a more intuitive and efficient solution but that may just be
> me.
>

I missed the blacklist point on this before.  Yes, a blacklist makes more 
sense for the 2nd list.  We'll try that out.

Patch

diff --git a/qemu-seccomp.c b/qemu-seccomp.c
index a7b33e2..033cfad 100644
--- a/qemu-seccomp.c
+++ b/qemu-seccomp.c
@@ -13,6 +13,7 @@ 
  * GNU GPL, version 2 or (at your option) any later version.
  */
 #include <stdio.h>
+#include <stdlib.h>
 #include <seccomp.h>
 #include "qemu-seccomp.h"
 
@@ -21,7 +22,7 @@  struct QemuSeccompSyscall {
     uint8_t priority;
 };
 
-static const struct QemuSeccompSyscall seccomp_whitelist[] = {
+static const struct QemuSeccompSyscall seccomp_whitelist_init[] = {
     { SCMP_SYS(timer_settime), 255 },
     { SCMP_SYS(timer_gettime), 254 },
     { SCMP_SYS(futex), 253 },
@@ -121,27 +122,107 @@  static const struct QemuSeccompSyscall seccomp_whitelist[] = {
     { SCMP_SYS(rt_sigtimedwait), 242 }
 };
 
-int seccomp_start(void)
+static const struct QemuSeccompSyscall seccomp_whitelist_main_loop[] = {
+    { SCMP_SYS(timer_settime), 255 },
+    { SCMP_SYS(timer_gettime), 254 },
+    { SCMP_SYS(futex), 253 },
+    { SCMP_SYS(select), 252 },
+    { SCMP_SYS(recvfrom), 251 },
+    { SCMP_SYS(sendto), 250 },
+    { SCMP_SYS(read), 249 },
+    { SCMP_SYS(brk), 248 },
+    { SCMP_SYS(mmap), 247 },
+    { SCMP_SYS(open), 247 },
+#if defined(__i386__)
+    { SCMP_SYS(fcntl64), 245 },
+    { SCMP_SYS(fstat64), 245 },
+    { SCMP_SYS(stat64), 245 },
+    { SCMP_SYS(getgid32), 245 },
+    { SCMP_SYS(getegid32), 245 },
+    { SCMP_SYS(getuid32), 245 },
+    { SCMP_SYS(geteuid32), 245 },
+    { SCMP_SYS(sigreturn), 245 },
+    { SCMP_SYS(_newselect), 245 },
+    { SCMP_SYS(_llseek), 245 },
+    { SCMP_SYS(mmap2), 245},
+    { SCMP_SYS(sigprocmask), 245 },
+#endif
+    { SCMP_SYS(exit), 245 },
+    { SCMP_SYS(timer_delete), 245 },
+    { SCMP_SYS(exit_group), 245 },
+    { SCMP_SYS(rt_sigreturn), 245 },
+    { SCMP_SYS(madvise), 245 },
+    { SCMP_SYS(write), 244 },
+    { SCMP_SYS(fcntl), 243 },
+    { SCMP_SYS(tgkill), 242 },
+    { SCMP_SYS(rt_sigaction), 242 },
+    { SCMP_SYS(pipe2), 242 },
+    { SCMP_SYS(munmap), 242 },
+    { SCMP_SYS(mremap), 242 },
+    { SCMP_SYS(getsockname), 242 },
+    { SCMP_SYS(getpeername), 242 },
+    { SCMP_SYS(close), 242 },
+    { SCMP_SYS(accept4), 242 },
+    { SCMP_SYS(eventfd2), 242 },
+    { SCMP_SYS(recvmsg), 242 },
+    { SCMP_SYS(ioctl), 242 },
+    { SCMP_SYS(rt_sigprocmask), 242 }
+};
+
+static int
+process_whitelist(const struct QemuSeccompSyscall *whitelist,
+                  unsigned int size, scmp_filter_ctx *ctx)
 {
     int rc = 0;
+
     unsigned int i = 0;
-    scmp_filter_ctx ctx;
+
+    for (i = 0; i < size; i++) {
+        rc = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, whitelist[i].num, 0);
+        if (rc < 0) {
+            return -1;
+        }
+
+        rc = seccomp_syscall_priority(ctx, whitelist[i].num,
+                                      whitelist[i].priority);
+        if (rc < 0) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+int
+seccomp_start(enum whitelist_mode mode, scmp_filter_ctx *ctx)
+{
+    int rc = 0;
 
     ctx = seccomp_init(SCMP_ACT_KILL);
     if (ctx == NULL) {
+        rc = -1;
         goto seccomp_return;
     }
 
-    for (i = 0; i < ARRAY_SIZE(seccomp_whitelist); i++) {
-        rc = seccomp_rule_add(ctx, SCMP_ACT_ALLOW, seccomp_whitelist[i].num, 0);
-        if (rc < 0) {
+    switch (mode) {
+    case INIT:
+        if (process_whitelist
+            (seccomp_whitelist_init,
+             ARRAY_SIZE(seccomp_whitelist_init), ctx) < 0) {
+            rc = -1;
             goto seccomp_return;
         }
-        rc = seccomp_syscall_priority(ctx, seccomp_whitelist[i].num,
-                                      seccomp_whitelist[i].priority);
-        if (rc < 0) {
+        break;
+    case MAIN_LOOP:
+        if (process_whitelist
+            (seccomp_whitelist_main_loop,
+             ARRAY_SIZE(seccomp_whitelist_main_loop), ctx) < 0) {
+            rc = -1;
             goto seccomp_return;
         }
+        break;
+    default:
+        rc = -1;
+        goto seccomp_return;
     }
 
     rc = seccomp_load(ctx);
diff --git a/qemu-seccomp.h b/qemu-seccomp.h
index b2fc3f8..1c97978 100644
--- a/qemu-seccomp.h
+++ b/qemu-seccomp.h
@@ -18,5 +18,10 @@ 
 #include <seccomp.h>
 #include "osdep.h"
 
-int seccomp_start(void);
+enum whitelist_mode {
+    INIT = 0,
+    MAIN_LOOP = 1,
+};
+
+int seccomp_start(enum whitelist_mode mode, scmp_filter_ctx *ctx);
 #endif
diff --git a/vl.c b/vl.c
index bec68cd..d50018f 100644
--- a/vl.c
+++ b/vl.c
@@ -774,10 +774,11 @@  static int bt_parse(const char *opt)
     return 1;
 }
 
-static int install_seccomp_filters(void)
+static int
+install_seccomp_filters(enum whitelist_mode mode, scmp_filter_ctx *ctx)
 {
 #ifdef CONFIG_SECCOMP
-    if (seccomp_start() < 0) {
+    if (seccomp_start(mode, ctx) < 0) {
         qerror_report(ERROR_CLASS_GENERIC_ERROR,
                 "failed to install seccomp syscall filter in the kernel");
         return -1;
@@ -2407,6 +2408,10 @@  int main(int argc, char **argv, char **envp)
     const char *trace_events = NULL;
     const char *trace_file = NULL;
 
+#ifdef CONFIG_SECCOMP
+    scmp_filter_ctx main_loop_ctx;
+#endif
+
     atexit(qemu_run_exit_notifiers);
     error_set_progname(argv[0]);
 
@@ -3330,11 +3335,13 @@  int main(int argc, char **argv, char **envp)
     }
 
     /* We should install seccomp filters even if -sandbox on is not used. */
+#ifdef CONFIG_SECCOMP
     if (seccomp_on) {
-        if (install_seccomp_filters() < 0) {
+        if (install_seccomp_filters(INIT, &main_loop_ctx) < 0) {
             exit(1);
         }
     }
+#endif
 
     if (machine == NULL) {
         fprintf(stderr, "No machine found.\n");
@@ -3794,6 +3801,14 @@  int main(int argc, char **argv, char **envp)
 
     os_setup_post();
 
+#ifdef CONFIG_SECCOMP
+    if (seccomp_on) {
+        if (install_seccomp_filters(MAIN_LOOP, &main_loop_ctx) < 0) {
+            exit(1);
+        }
+    }
+#endif
+
     resume_all_vcpus();
     main_loop();
     bdrv_close_all();