diff mbox

[for-1.7] seccomp: setting "-sandbox on" by default

Message ID 1382440906-3852-1-git-send-email-otubo@linux.vnet.ibm.com
State New
Headers show

Commit Message

Eduardo Otubo Oct. 22, 2013, 11:21 a.m. UTC
Inverting the way sandbox handles arguments, making possible to have no
argument and still have '-sandbox on' enabled.

Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
---

The option '-sandbox on' is now used by default by virt-test[0] -- it has been
merged into the 'next' branch and will be available in the next release,
meaning we have a back support for regression tests if anything breaks because
of some missing system call not listed in the whitelist.

This being said, I think it makes sense to have this option set to 'on' by
default in the next Qemu version. It's been a while since no missing syscall is
reported and at this point the whitelist seems to be pretty mature.

[0] - https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8ec0f18365dcba714

 qemu-options.hx |  4 ++--
 vl.c            | 47 ++++++++++++++++++++++++++++-------------------
 2 files changed, 30 insertions(+), 21 deletions(-)

Comments

Anthony Liguori Oct. 22, 2013, 1 p.m. UTC | #1
On Tue, Oct 22, 2013 at 12:21 PM, Eduardo Otubo
<otubo@linux.vnet.ibm.com> wrote:
> Inverting the way sandbox handles arguments, making possible to have no
> argument and still have '-sandbox on' enabled.
>
> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> ---
>
> The option '-sandbox on' is now used by default by virt-test[0] -- it has been
> merged into the 'next' branch and will be available in the next release,
> meaning we have a back support for regression tests if anything breaks because
> of some missing system call not listed in the whitelist.
>
> This being said, I think it makes sense to have this option set to 'on' by
> default in the next Qemu version. It's been a while since no missing syscall is
> reported and at this point the whitelist seems to be pretty mature.
>
> [0] - https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8ec0f18365dcba714

This breaks hot_add of a network device that uses a script= argument, correct?

If so, this cannot be made default.

Regards,

Anthony Liguori

>
>  qemu-options.hx |  4 ++--
>  vl.c            | 47 ++++++++++++++++++++++++++++-------------------
>  2 files changed, 30 insertions(+), 21 deletions(-)
>
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 5dc8b75..315a86d 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -2982,13 +2982,13 @@ Old param mode (ARM only).
>  ETEXI
>
>  DEF("sandbox", HAS_ARG, QEMU_OPTION_sandbox, \
> -    "-sandbox <arg>  Enable seccomp mode 2 system call filter (default 'off').\n",
> +    "-sandbox <arg>  Enable seccomp mode 2 system call filter (default 'on').\n",
>      QEMU_ARCH_ALL)
>  STEXI
>  @item -sandbox @var{arg}
>  @findex -sandbox
>  Enable Seccomp mode 2 system call filter. 'on' will enable syscall filtering and 'off' will
> -disable it.  The default is 'off'.
> +disable it.  The default is 'on'.
>  ETEXI
>
>  DEF("readconfig", HAS_ARG, QEMU_OPTION_readconfig,
> diff --git a/vl.c b/vl.c
> index b42ac67..ae3bdc9 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -529,6 +529,20 @@ static QemuOptsList qemu_msg_opts = {
>      },
>  };
>
> +static QemuOpts *qemu_get_sandbox_opts(void)
> +{
> +    QemuOptsList *list;
> +    QemuOpts *opts;
> +
> +    list = qemu_find_opts("sandbox");
> +    assert(list);
> +    opts = qemu_opts_find(list, NULL);
> +    if (!opts) {
> +        opts = qemu_opts_create_nofail(list);
> +    }
> +    return opts;
> +}
> +
>  /**
>   * Get machine options
>   *
> @@ -960,24 +974,9 @@ static int bt_parse(const char *opt)
>      return 1;
>  }
>
> -static int parse_sandbox(QemuOpts *opts, void *opaque)
> +static bool sandbox_enabled(bool default_usb)
>  {
> -    /* FIXME: change this to true for 1.3 */
> -    if (qemu_opt_get_bool(opts, "enable", false)) {
> -#ifdef CONFIG_SECCOMP
> -        if (seccomp_start() < 0) {
> -            qerror_report(ERROR_CLASS_GENERIC_ERROR,
> -                          "failed to install seccomp syscall filter in the kernel");
> -            return -1;
> -        }
> -#else
> -        qerror_report(ERROR_CLASS_GENERIC_ERROR,
> -                      "sandboxing request but seccomp is not compiled into this build");
> -        return -1;
> -#endif
> -    }
> -
> -    return 0;
> +    return qemu_opt_get_bool(qemu_get_sandbox_opts(), "sandbox", default_usb);
>  }
>
>  bool usb_enabled(bool default_usb)
> @@ -3806,8 +3805,18 @@ int main(int argc, char **argv, char **envp)
>          exit(1);
>      }
>
> -    if (qemu_opts_foreach(qemu_find_opts("sandbox"), parse_sandbox, NULL, 0)) {
> -        exit(1);
> +    if (sandbox_enabled(true)) {
> +#ifdef CONFIG_SECCOMP
> +        if (seccomp_start() < 0) {
> +            qerror_report(ERROR_CLASS_GENERIC_ERROR,
> +                          "failed to install seccomp syscall filter in the kernel");
> +            return -1;
> +        }
> +#else
> +        qerror_report(ERROR_CLASS_GENERIC_ERROR,
> +                      "sandboxing request but seccomp is not compiled into this build");
> +        return -1;
> +#endif
>      }
>
>  #ifndef _WIN32
> --
> 1.8.3.1
>
Eduardo Otubo Oct. 23, 2013, 2:42 p.m. UTC | #2
On 10/22/2013 11:00 AM, Anthony Liguori wrote:
> On Tue, Oct 22, 2013 at 12:21 PM, Eduardo Otubo
> <otubo@linux.vnet.ibm.com> wrote:
>> Inverting the way sandbox handles arguments, making possible to have no
>> argument and still have '-sandbox on' enabled.
>>
>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>> ---
>>
>> The option '-sandbox on' is now used by default by virt-test[0] -- it has been
>> merged into the 'next' branch and will be available in the next release,
>> meaning we have a back support for regression tests if anything breaks because
>> of some missing system call not listed in the whitelist.
>>
>> This being said, I think it makes sense to have this option set to 'on' by
>> default in the next Qemu version. It's been a while since no missing syscall is
>> reported and at this point the whitelist seems to be pretty mature.
>>
>> [0] - https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8ec0f18365dcba714
>
> This breaks hot_add of a network device that uses a script= argument, correct?
>
> If so, this cannot be made default.

Anthony, I believe you're talking about the blacklist feature. This is 
the old whitelist that is already upstream and it does not block any 
network device to be hot plugged.

>
> Regards,
>
> Anthony Liguori
>
>>
>>   qemu-options.hx |  4 ++--
>>   vl.c            | 47 ++++++++++++++++++++++++++++-------------------
>>   2 files changed, 30 insertions(+), 21 deletions(-)
>>
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 5dc8b75..315a86d 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -2982,13 +2982,13 @@ Old param mode (ARM only).
>>   ETEXI
>>
>>   DEF("sandbox", HAS_ARG, QEMU_OPTION_sandbox, \
>> -    "-sandbox <arg>  Enable seccomp mode 2 system call filter (default 'off').\n",
>> +    "-sandbox <arg>  Enable seccomp mode 2 system call filter (default 'on').\n",
>>       QEMU_ARCH_ALL)
>>   STEXI
>>   @item -sandbox @var{arg}
>>   @findex -sandbox
>>   Enable Seccomp mode 2 system call filter. 'on' will enable syscall filtering and 'off' will
>> -disable it.  The default is 'off'.
>> +disable it.  The default is 'on'.
>>   ETEXI
>>
>>   DEF("readconfig", HAS_ARG, QEMU_OPTION_readconfig,
>> diff --git a/vl.c b/vl.c
>> index b42ac67..ae3bdc9 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -529,6 +529,20 @@ static QemuOptsList qemu_msg_opts = {
>>       },
>>   };
>>
>> +static QemuOpts *qemu_get_sandbox_opts(void)
>> +{
>> +    QemuOptsList *list;
>> +    QemuOpts *opts;
>> +
>> +    list = qemu_find_opts("sandbox");
>> +    assert(list);
>> +    opts = qemu_opts_find(list, NULL);
>> +    if (!opts) {
>> +        opts = qemu_opts_create_nofail(list);
>> +    }
>> +    return opts;
>> +}
>> +
>>   /**
>>    * Get machine options
>>    *
>> @@ -960,24 +974,9 @@ static int bt_parse(const char *opt)
>>       return 1;
>>   }
>>
>> -static int parse_sandbox(QemuOpts *opts, void *opaque)
>> +static bool sandbox_enabled(bool default_usb)
>>   {
>> -    /* FIXME: change this to true for 1.3 */
>> -    if (qemu_opt_get_bool(opts, "enable", false)) {
>> -#ifdef CONFIG_SECCOMP
>> -        if (seccomp_start() < 0) {
>> -            qerror_report(ERROR_CLASS_GENERIC_ERROR,
>> -                          "failed to install seccomp syscall filter in the kernel");
>> -            return -1;
>> -        }
>> -#else
>> -        qerror_report(ERROR_CLASS_GENERIC_ERROR,
>> -                      "sandboxing request but seccomp is not compiled into this build");
>> -        return -1;
>> -#endif
>> -    }
>> -
>> -    return 0;
>> +    return qemu_opt_get_bool(qemu_get_sandbox_opts(), "sandbox", default_usb);
>>   }
>>
>>   bool usb_enabled(bool default_usb)
>> @@ -3806,8 +3805,18 @@ int main(int argc, char **argv, char **envp)
>>           exit(1);
>>       }
>>
>> -    if (qemu_opts_foreach(qemu_find_opts("sandbox"), parse_sandbox, NULL, 0)) {
>> -        exit(1);
>> +    if (sandbox_enabled(true)) {
>> +#ifdef CONFIG_SECCOMP
>> +        if (seccomp_start() < 0) {
>> +            qerror_report(ERROR_CLASS_GENERIC_ERROR,
>> +                          "failed to install seccomp syscall filter in the kernel");
>> +            return -1;
>> +        }
>> +#else
>> +        qerror_report(ERROR_CLASS_GENERIC_ERROR,
>> +                      "sandboxing request but seccomp is not compiled into this build");
>> +        return -1;
>> +#endif
>>       }
>>
>>   #ifndef _WIN32
>> --
>> 1.8.3.1
>>
>
Stefan Hajnoczi Oct. 30, 2013, 10:04 a.m. UTC | #3
On Wed, Oct 23, 2013 at 12:42:34PM -0200, Eduardo Otubo wrote:
> 
> 
> On 10/22/2013 11:00 AM, Anthony Liguori wrote:
> >On Tue, Oct 22, 2013 at 12:21 PM, Eduardo Otubo
> ><otubo@linux.vnet.ibm.com> wrote:
> >>Inverting the way sandbox handles arguments, making possible to have no
> >>argument and still have '-sandbox on' enabled.
> >>
> >>Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> >>---
> >>
> >>The option '-sandbox on' is now used by default by virt-test[0] -- it has been
> >>merged into the 'next' branch and will be available in the next release,
> >>meaning we have a back support for regression tests if anything breaks because
> >>of some missing system call not listed in the whitelist.
> >>
> >>This being said, I think it makes sense to have this option set to 'on' by
> >>default in the next Qemu version. It's been a while since no missing syscall is
> >>reported and at this point the whitelist seems to be pretty mature.
> >>
> >>[0] - https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8ec0f18365dcba714
> >
> >This breaks hot_add of a network device that uses a script= argument, correct?
> >
> >If so, this cannot be made default.
> 
> Anthony, I believe you're talking about the blacklist feature. This
> is the old whitelist that is already upstream and it does not block
> any network device to be hot plugged.

The following fails to start here (the shell hangs and ps shows QEMU is
a <defunct> process):

qemu-system-x86_64 -sandbox on -enable-kvm -m 1024 -cpu host \
                   -drive if=virtio,cache=none,file=test.img

It is using the GTK UI.

Stefan
Paolo Bonzini Nov. 21, 2013, 3:14 p.m. UTC | #4
Il 30/10/2013 11:04, Stefan Hajnoczi ha scritto:
> On Wed, Oct 23, 2013 at 12:42:34PM -0200, Eduardo Otubo wrote:
>>
>>
>> On 10/22/2013 11:00 AM, Anthony Liguori wrote:
>>> On Tue, Oct 22, 2013 at 12:21 PM, Eduardo Otubo
>>> <otubo@linux.vnet.ibm.com> wrote:
>>>> Inverting the way sandbox handles arguments, making possible to have no
>>>> argument and still have '-sandbox on' enabled.
>>>>
>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>>>> ---
>>>>
>>>> The option '-sandbox on' is now used by default by virt-test[0] -- it has been
>>>> merged into the 'next' branch and will be available in the next release,
>>>> meaning we have a back support for regression tests if anything breaks because
>>>> of some missing system call not listed in the whitelist.
>>>>
>>>> This being said, I think it makes sense to have this option set to 'on' by
>>>> default in the next Qemu version. It's been a while since no missing syscall is
>>>> reported and at this point the whitelist seems to be pretty mature.
>>>>
>>>> [0] - https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8ec0f18365dcba714
>>>
>>> This breaks hot_add of a network device that uses a script= argument, correct?
>>>
>>> If so, this cannot be made default.
>>
>> Anthony, I believe you're talking about the blacklist feature. This
>> is the old whitelist that is already upstream and it does not block
>> any network device to be hot plugged.
> 
> The following fails to start here (the shell hangs and ps shows QEMU is
> a <defunct> process):
> 
> qemu-system-x86_64 -sandbox on -enable-kvm -m 1024 -cpu host \
>                    -drive if=virtio,cache=none,file=test.img

Easier-to-debug failures are another prerequisite for enabling the
sandbox by default, I think.

Paolo
Paul Moore Nov. 21, 2013, 3:48 p.m. UTC | #5
On Thursday, November 21, 2013 04:14:11 PM Paolo Bonzini wrote:
> Il 30/10/2013 11:04, Stefan Hajnoczi ha scritto:
> > On Wed, Oct 23, 2013 at 12:42:34PM -0200, Eduardo Otubo wrote:
> >> On 10/22/2013 11:00 AM, Anthony Liguori wrote:
> >>> On Tue, Oct 22, 2013 at 12:21 PM, Eduardo Otubo
> >>> 
> >>> <otubo@linux.vnet.ibm.com> wrote:
> >>>> Inverting the way sandbox handles arguments, making possible to have no
> >>>> argument and still have '-sandbox on' enabled.
> >>>> 
> >>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> >>>> ---
> >>>> 
> >>>> The option '-sandbox on' is now used by default by virt-test[0] -- it
> >>>> has been merged into the 'next' branch and will be available in the
> >>>> next release, meaning we have a back support for regression tests if
> >>>> anything breaks because of some missing system call not listed in the
> >>>> whitelist.
> >>>> 
> >>>> This being said, I think it makes sense to have this option set to 'on'
> >>>> by
> >>>> default in the next Qemu version. It's been a while since no missing
> >>>> syscall is reported and at this point the whitelist seems to be pretty
> >>>> mature.
> >>>> 
> >>>> [0] -
> >>>> https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8e
> >>>> c0f18365dcba714>>> 
> >>> This breaks hot_add of a network device that uses a script= argument,
> >>> correct?
> >>> 
> >>> If so, this cannot be made default.
> >> 
> >> Anthony, I believe you're talking about the blacklist feature. This
> >> is the old whitelist that is already upstream and it does not block
> >> any network device to be hot plugged.
> > 
> > The following fails to start here (the shell hangs and ps shows QEMU is
> > a <defunct> process):
> > 
> > qemu-system-x86_64 -sandbox on -enable-kvm -m 1024 -cpu host \
> > 
> >                    -drive if=virtio,cache=none,file=test.img
> 
> Easier-to-debug failures are another prerequisite for enabling the
> sandbox by default, I think.

I believe I've posted this information before, but just in case ...

IMHO, it is really not that hard to debug a seccomp failure; the first step is 
to look for the failure in the audit log or syslog.  If you are on a 
Fedora/RHEL based system you are most likely running audit, so finding the 
seccomp failures are quite simple with the 'ausearch' command:

 # ausearch -m SECCOMP
 ----
 time->Wed Nov 20 09:52:08 2013
 type=SECCOMP msg=audit(1384912328.482:6656): auid=0 uid=0 gid=0 ses=854
  subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=12087
  comm="qemu-kvm" sig=31 syscall=62 compat=0 ip=0x7f7a1d2abc67 code=0x0

... if you are using syslog, feel free to use whatever tool you prefer, e.g. 
grep, less, etc.

Once you have the syscall number, "syscall=62", in the audit message above, 
you can use the 'scmp_sys_resolver' to resolve the number into a name:

 # scmp_sys_resolver 62
 kill

The 'scmp_sys_resolver' tool is part of the libseccomp-devel package on 
Fedora/RHEL based systems, it may be packaged differently on other 
distributions.

I'm always open to suggestions on how to improve the development/debugging 
process, so if you have any ideas please let me know.

-Paul
Eduardo Otubo Nov. 21, 2013, 4:22 p.m. UTC | #6
On 11/21/2013 01:48 PM, Paul Moore wrote:
> On Thursday, November 21, 2013 04:14:11 PM Paolo Bonzini wrote:
>> Il 30/10/2013 11:04, Stefan Hajnoczi ha scritto:
>>> On Wed, Oct 23, 2013 at 12:42:34PM -0200, Eduardo Otubo wrote:
>>>> On 10/22/2013 11:00 AM, Anthony Liguori wrote:
>>>>> On Tue, Oct 22, 2013 at 12:21 PM, Eduardo Otubo
>>>>>
>>>>> <otubo@linux.vnet.ibm.com> wrote:
>>>>>> Inverting the way sandbox handles arguments, making possible to have no
>>>>>> argument and still have '-sandbox on' enabled.
>>>>>>
>>>>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
>>>>>> ---
>>>>>>
>>>>>> The option '-sandbox on' is now used by default by virt-test[0] -- it
>>>>>> has been merged into the 'next' branch and will be available in the
>>>>>> next release, meaning we have a back support for regression tests if
>>>>>> anything breaks because of some missing system call not listed in the
>>>>>> whitelist.
>>>>>>
>>>>>> This being said, I think it makes sense to have this option set to 'on'
>>>>>> by
>>>>>> default in the next Qemu version. It's been a while since no missing
>>>>>> syscall is reported and at this point the whitelist seems to be pretty
>>>>>> mature.
>>>>>>
>>>>>> [0] -
>>>>>> https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8e
>>>>>> c0f18365dcba714>>>
>>>>> This breaks hot_add of a network device that uses a script= argument,
>>>>> correct?
>>>>>
>>>>> If so, this cannot be made default.
>>>>
>>>> Anthony, I believe you're talking about the blacklist feature. This
>>>> is the old whitelist that is already upstream and it does not block
>>>> any network device to be hot plugged.
>>>
>>> The following fails to start here (the shell hangs and ps shows QEMU is
>>> a <defunct> process):
>>>
>>> qemu-system-x86_64 -sandbox on -enable-kvm -m 1024 -cpu host \
>>>
>>>                     -drive if=virtio,cache=none,file=test.img
>>
>> Easier-to-debug failures are another prerequisite for enabling the
>> sandbox by default, I think.

I've already sent a patch for a "debug mode" in the past. It was denied 
because of two main points: (1) Anthony was looking for a more solid and 
closed solution for sandboxing. A "debug" or "learning" mode would be 
too much exposure of the attack surface. (2) Debug mode was changing the 
Qemu's sig mask in a way that it was breaking it. And besides, at that 
time, there were too many linked libraries that would interfere on this 
sig handling, so we gave up on this feature.

>
> I believe I've posted this information before, but just in case ...
>
> IMHO, it is really not that hard to debug a seccomp failure; the first step is
> to look for the failure in the audit log or syslog.  If you are on a
> Fedora/RHEL based system you are most likely running audit, so finding the
> seccomp failures are quite simple with the 'ausearch' command:
>
>   # ausearch -m SECCOMP
>   ----
>   time->Wed Nov 20 09:52:08 2013
>   type=SECCOMP msg=audit(1384912328.482:6656): auid=0 uid=0 gid=0 ses=854
>    subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=12087
>    comm="qemu-kvm" sig=31 syscall=62 compat=0 ip=0x7f7a1d2abc67 code=0x0
>
> ... if you are using syslog, feel free to use whatever tool you prefer, e.g.
> grep, less, etc.
>
> Once you have the syscall number, "syscall=62", in the audit message above,
> you can use the 'scmp_sys_resolver' to resolve the number into a name:
>
>   # scmp_sys_resolver 62
>   kill
>
> The 'scmp_sys_resolver' tool is part of the libseccomp-devel package on
> Fedora/RHEL based systems, it may be packaged differently on other
> distributions.
>
> I'm always open to suggestions on how to improve the development/debugging
> process, so if you have any ideas please let me know.

Also, I've been working on some improvements on virt-test side. I'm 
trying to make it report the illegal system call using audit log and 
libseccomp as source of information. This way a simple run could 
identify missing system calls for the whitelist.
Stefan Hajnoczi Nov. 22, 2013, 10:34 a.m. UTC | #7
On Wed, Oct 30, 2013 at 11:04:39AM +0100, Stefan Hajnoczi wrote:
> On Wed, Oct 23, 2013 at 12:42:34PM -0200, Eduardo Otubo wrote:
> > On 10/22/2013 11:00 AM, Anthony Liguori wrote:
> > >On Tue, Oct 22, 2013 at 12:21 PM, Eduardo Otubo
> > ><otubo@linux.vnet.ibm.com> wrote:
> > >>Inverting the way sandbox handles arguments, making possible to have no
> > >>argument and still have '-sandbox on' enabled.
> > >>
> > >>Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> > >>---
> > >>
> > >>The option '-sandbox on' is now used by default by virt-test[0] -- it has been
> > >>merged into the 'next' branch and will be available in the next release,
> > >>meaning we have a back support for regression tests if anything breaks because
> > >>of some missing system call not listed in the whitelist.
> > >>
> > >>This being said, I think it makes sense to have this option set to 'on' by
> > >>default in the next Qemu version. It's been a while since no missing syscall is
> > >>reported and at this point the whitelist seems to be pretty mature.
> > >>
> > >>[0] - https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8ec0f18365dcba714
> > >
> > >This breaks hot_add of a network device that uses a script= argument, correct?
> > >
> > >If so, this cannot be made default.
> > 
> > Anthony, I believe you're talking about the blacklist feature. This
> > is the old whitelist that is already upstream and it does not block
> > any network device to be hot plugged.
> 
> The following fails to start here (the shell hangs and ps shows QEMU is
> a <defunct> process):
> 
> qemu-system-x86_64 -sandbox on -enable-kvm -m 1024 -cpu host \
>                    -drive if=virtio,cache=none,file=test.img
> 
> It is using the GTK UI.

IMO this seccomp approach is doomed since QEMU does not practice
privilege separation.  QEMU is monolithic so it's really hard to create
a meaningful sets of system calls.  To avoid breaking stuff you need to
be too liberal, defeating the purpose of seccomp.

For each QEMU command-line there may be a different set of syscalls that
should be allowed/forbidden.

The existing approach clearly doesn't support the full range of options
that users specify on the command-line.  So I guess the options are:

1. Don't make it the default since it breaks stuff but use it for very
specific scenarios (e.g. libvirt use cases that have been well tested).

2. Provide a kind of syscall set for various QEMU options and apply the
union of them at launch.  This still seems fragile but in theory it
could work.

Stefan
Stefan Hajnoczi Nov. 22, 2013, 10:39 a.m. UTC | #8
On Thu, Nov 21, 2013 at 10:48:58AM -0500, Paul Moore wrote:
> On Thursday, November 21, 2013 04:14:11 PM Paolo Bonzini wrote:
> > Il 30/10/2013 11:04, Stefan Hajnoczi ha scritto:
> > > On Wed, Oct 23, 2013 at 12:42:34PM -0200, Eduardo Otubo wrote:
> > >> On 10/22/2013 11:00 AM, Anthony Liguori wrote:
> > >>> On Tue, Oct 22, 2013 at 12:21 PM, Eduardo Otubo
> > >>> 
> > >>> <otubo@linux.vnet.ibm.com> wrote:
> > >>>> Inverting the way sandbox handles arguments, making possible to have no
> > >>>> argument and still have '-sandbox on' enabled.
> > >>>> 
> > >>>> Signed-off-by: Eduardo Otubo <otubo@linux.vnet.ibm.com>
> > >>>> ---
> > >>>> 
> > >>>> The option '-sandbox on' is now used by default by virt-test[0] -- it
> > >>>> has been merged into the 'next' branch and will be available in the
> > >>>> next release, meaning we have a back support for regression tests if
> > >>>> anything breaks because of some missing system call not listed in the
> > >>>> whitelist.
> > >>>> 
> > >>>> This being said, I think it makes sense to have this option set to 'on'
> > >>>> by
> > >>>> default in the next Qemu version. It's been a while since no missing
> > >>>> syscall is reported and at this point the whitelist seems to be pretty
> > >>>> mature.
> > >>>> 
> > >>>> [0] -
> > >>>> https://github.com/autotest/virt-test/commit/50e1f7d47a94f4c770880cd8e
> > >>>> c0f18365dcba714>>> 
> > >>> This breaks hot_add of a network device that uses a script= argument,
> > >>> correct?
> > >>> 
> > >>> If so, this cannot be made default.
> > >> 
> > >> Anthony, I believe you're talking about the blacklist feature. This
> > >> is the old whitelist that is already upstream and it does not block
> > >> any network device to be hot plugged.
> > > 
> > > The following fails to start here (the shell hangs and ps shows QEMU is
> > > a <defunct> process):
> > > 
> > > qemu-system-x86_64 -sandbox on -enable-kvm -m 1024 -cpu host \
> > > 
> > >                    -drive if=virtio,cache=none,file=test.img
> > 
> > Easier-to-debug failures are another prerequisite for enabling the
> > sandbox by default, I think.
[...]
> I'm always open to suggestions on how to improve the development/debugging 
> process, so if you have any ideas please let me know.

The failure mode is terrible:
  "The following fails to start here (the shell hangs and ps shows QEMU
  is a <defunct> process)"

When a process dies the shell prints a warning.  That alerts the user
and prompts them to take further steps.

Hanging the shell is a really bad way to fail.  We can't expect users to
begin searching logs for audit failures.  They probably don't even know
about audit or seccomp.

Is it possible to produce output when a seccomp violation takes place?

Stefan
Paul Moore Nov. 22, 2013, 2:38 p.m. UTC | #9
On Friday, November 22, 2013 11:34:41 AM Stefan Hajnoczi wrote:
> IMO this seccomp approach is doomed since QEMU does not practice
> privilege separation.  QEMU is monolithic so it's really hard to create
> a meaningful sets of system calls.

I'm a big fan of decomposing QEMU, but based on previous discussions there 
seems to be a lot of fear from the core QEMU folks around decomposition; 
enough that I'm not sure it is worth the time and effort at this point to 
pursue it.  While I agree that a decomposed QEMU would be able to make better 
use of syscall filtering (and LSM/SELinux protection, and ...) I don't believe 
it means syscall filtering is a complete lost cause with a monolithic QEMU.  
Any improvement you can make, no matter how small, is still and improvement.

> To avoid breaking stuff you need to be too liberal, defeating the purpose of
> seccomp.

Even if you can only disable a few syscalls you are still better off than you 
were before.  Could it be done better, of course it could, but it doesn't mean 
you shouldn't try for some benefit.

> For each QEMU command-line there may be a different set of syscalls that
> should be allowed/forbidden.

I'm not sure if you missed it or not, but I had an email exchange with Eduardo 
on this list about making the syscall whitelist a bit more "intelligent" and 
dependent on what functionality was enabled for a given QEMU instance.  This 
should help a bit with the problems you are describing.

> The existing approach clearly doesn't support the full range of options
> that users specify on the command-line.

Bugs.  It will get fixed in time with more testing/debugging.  Eduardo is 
working on improving the testing and RH's QA folks are working hard to shake 
out the bugs too.  I just posted another bug fix patch to the whitelist a few 
days ago.

> So I guess the options are:
> 
> 1. Don't make it the default since it breaks stuff but use it for very
> specific scenarios (e.g. libvirt use cases that have been well tested).

In my opinion, I think it was probably a bit premature to make enable it by 
default, but at some point in the future I think we do need to do this.

> 2. Provide a kind of syscall set for various QEMU options and apply the
> union of them at launch.  This still seems fragile but in theory it
> could work.

This is what I was discussing above.  I think this is likely the next big 
improvement.
Paul Moore Nov. 22, 2013, 2:44 p.m. UTC | #10
On Friday, November 22, 2013 11:39:31 AM Stefan Hajnoczi wrote:
> On Thu, Nov 21, 2013 at 10:48:58AM -0500, Paul Moore wrote:
> > I'm always open to suggestions on how to improve the development/debugging
> > process, so if you have any ideas please let me know.
> 
> The failure mode is terrible:

Glad to see you don't feel strongly about things.

>   "The following fails to start here (the shell hangs and ps shows QEMU
>   is a <defunct> process)"
> 
> When a process dies the shell prints a warning.  That alerts the user
> and prompts them to take further steps.
> 
> Hanging the shell is a really bad way to fail.  We can't expect users to
> begin searching logs for audit failures.  They probably don't even know
> about audit or seccomp.

First things first, if a normal user hits a seccomp failure I consider it to 
be a bug, and to be honest, I've seen much nastier, much more subtle bugs than 
an inadvertent seccomp "death".  In a perfect world only developers would run 
into this problem and I would expect developers to be smart enough to figure 
out what is going on.

Getting past that, if seccomp is configured to kill the process when it 
violates the filter rules there isn't much else we can do; the kernel kills 
the process and then the rest of userspace, e.g. the shell/libvirt/etc., does 
whatever it does.  We have very little control over things.

> Is it possible to produce output when a seccomp violation takes place?

See the earlier comments from Eduardo about his attempts in this area.
Stefan Hajnoczi Nov. 22, 2013, 3:48 p.m. UTC | #11
On Fri, Nov 22, 2013 at 09:44:42AM -0500, Paul Moore wrote:
> On Friday, November 22, 2013 11:39:31 AM Stefan Hajnoczi wrote:
> > On Thu, Nov 21, 2013 at 10:48:58AM -0500, Paul Moore wrote:
> > > I'm always open to suggestions on how to improve the development/debugging
> > > process, so if you have any ideas please let me know.
> > 
> > The failure mode is terrible:
> 
> Glad to see you don't feel strongly about things.

Sorry for the rant :).  I know you and Eduardo understand the issues and
have already been working on them.

I hope hearing it from a developer who isn't following seccomp is useful
though.  It shows which issues stick out and hinder usability.  Users
will only be happy with seccomp when it works silently behind the
scenes.  Developers will only be happy with seccomp if it's easy and
rewarding to support/debug.

Stefan
Paul Moore Nov. 22, 2013, 4 p.m. UTC | #12
On Friday, November 22, 2013 04:48:41 PM Stefan Hajnoczi wrote:
> On Fri, Nov 22, 2013 at 09:44:42AM -0500, Paul Moore wrote:
> > On Friday, November 22, 2013 11:39:31 AM Stefan Hajnoczi wrote:
> > > On Thu, Nov 21, 2013 at 10:48:58AM -0500, Paul Moore wrote:
> > > > I'm always open to suggestions on how to improve the
> > > > development/debugging
> > > > process, so if you have any ideas please let me know.
> > > 
> > > The failure mode is terrible:
> > Glad to see you don't feel strongly about things.
> 
> Sorry for the rant :).  I know you and Eduardo understand the issues and
> have already been working on them.

I can't speak for Eduardo, but no worries on my end; it just wouldn't be an 
Open Source project without a bit of hyperbole now and then would it? ;)

> I hope hearing it from a developer who isn't following seccomp is useful
> though.

Definitely.  I should have said it earlier, but I do appreciate you taking the 
time to comment.

> It shows which issues stick out and hinder usability.  Users will only be
> happy with seccomp when it works silently behind the scenes.

Exactly.  Users don't tolerate bugs and I don't blame them.  After all, at 
some point we are all users too.

> Developers will only be happy with seccomp if it's easy and rewarding to
> support/debug.

Agreed.

As a developer, how do you feel about the audit/syslog based approach I 
mentioned earlier?
Stefan Hajnoczi Dec. 4, 2013, 9:39 a.m. UTC | #13
On Fri, Nov 22, 2013 at 11:00:24AM -0500, Paul Moore wrote:
> > Developers will only be happy with seccomp if it's easy and rewarding to
> > support/debug.
> 
> Agreed.
> 
> As a developer, how do you feel about the audit/syslog based approach I 
> mentioned earlier?

I used the commands you posted (I think that's what you mean).  They
produce useful output.

The problem is that without an error message on stderr or from the
shell, no one will think "QEMU process dead and hung == check seccomp"
immediately.  It's frustrating to deal with a "silent" failure.

Stefan
Eduardo Otubo Dec. 4, 2013, 1:17 p.m. UTC | #14
>
>> The existing approach clearly doesn't support the full range of options
>> that users specify on the command-line.
>
> Bugs.  It will get fixed in time with more testing/debugging.  Eduardo is
> working on improving the testing and RH's QA folks are working hard to shake
> out the bugs too.  I just posted another bug fix patch to the whitelist a few
> days ago.

Exactly, I'm working close with virt-test team to improve the testing 
and feedback for possible illegal syscalls on various scenarios.

>
>> So I guess the options are:
>>
>> 1. Don't make it the default since it breaks stuff but use it for very
>> specific scenarios (e.g. libvirt use cases that have been well tested).
>
> In my opinion, I think it was probably a bit premature to make enable it by
> default, but at some point in the future I think we do need to do this.

I have to admit it was a little premature, yes. But I think once we have 
a stable set of tool in virt-test, we can turn it on by default in a 
near future.

>
>> 2. Provide a kind of syscall set for various QEMU options and apply the
>> union of them at launch.  This still seems fragile but in theory it
>> could work.
>
> This is what I was discussing above.  I think this is likely the next big
> improvement.
>

That's the feature I'm currently working on right now. We'll see some 
improvements in the future. :)
Eduardo Otubo Dec. 4, 2013, 1:21 p.m. UTC | #15
On 12/04/2013 07:39 AM, Stefan Hajnoczi wrote:
> On Fri, Nov 22, 2013 at 11:00:24AM -0500, Paul Moore wrote:
>>> Developers will only be happy with seccomp if it's easy and rewarding to
>>> support/debug.
>>
>> Agreed.
>>
>> As a developer, how do you feel about the audit/syslog based approach I
>> mentioned earlier?
>
> I used the commands you posted (I think that's what you mean).  They
> produce useful output.
>
> The problem is that without an error message on stderr or from the
> shell, no one will think "QEMU process dead and hung == check seccomp"
> immediately.  It's frustrating to deal with a "silent" failure.

The process dies with a SIGKILL, and sig handling in Qemu is hard to 
implement due to dozen of external linked libraries that has their own 
signal masks and conflicts with seccomp. I've already tried this 
approach in the past (you can find in the list by searching for debug mode)

The optimal goal here is to use virt-test and audit log to eliminate 
these sorts of things.
Corey Bryant Dec. 4, 2013, 2:46 p.m. UTC | #16
On 12/04/2013 08:21 AM, Eduardo Otubo wrote:
>
>
> On 12/04/2013 07:39 AM, Stefan Hajnoczi wrote:
>> On Fri, Nov 22, 2013 at 11:00:24AM -0500, Paul Moore wrote:
>>>> Developers will only be happy with seccomp if it's easy and
>>>> rewarding to
>>>> support/debug.
>>>
>>> Agreed.
>>>
>>> As a developer, how do you feel about the audit/syslog based approach I
>>> mentioned earlier?
>>
>> I used the commands you posted (I think that's what you mean).  They
>> produce useful output.
>>
>> The problem is that without an error message on stderr or from the
>> shell, no one will think "QEMU process dead and hung == check seccomp"
>> immediately.  It's frustrating to deal with a "silent" failure.
>
> The process dies with a SIGKILL, and sig handling in Qemu is hard to
> implement due to dozen of external linked libraries that has their own
> signal masks and conflicts with seccomp. I've already tried this
> approach in the past (you can find in the list by searching for debug mode)

And just to be clear, the signal handling approach was only for debug 
purposes.

There are basically three ways to fail a syscall with seccomp:

SECCOMP_RET_KILL - kernel kills the task immediately without executing 
syscall
SECCOMP_RET_TRAP - kernel sends SIGSYS to the task without executing syscall
SECCOMP_RET_ERRNO - kernel returns an errno to the task wtihout 
executing syscall

You could issue a better error messages if you used TRAP or ERRNO, but 
giving control back to QEMU after (presumably) arbitrary code is being 
executed sort of defeats the purpose.
Stefan Hajnoczi Dec. 5, 2013, 1:15 p.m. UTC | #17
On Wed, Dec 04, 2013 at 11:21:12AM -0200, Eduardo Otubo wrote:
> On 12/04/2013 07:39 AM, Stefan Hajnoczi wrote:
> >On Fri, Nov 22, 2013 at 11:00:24AM -0500, Paul Moore wrote:
> >>>Developers will only be happy with seccomp if it's easy and rewarding to
> >>>support/debug.
> >>
> >>Agreed.
> >>
> >>As a developer, how do you feel about the audit/syslog based approach I
> >>mentioned earlier?
> >
> >I used the commands you posted (I think that's what you mean).  They
> >produce useful output.
> >
> >The problem is that without an error message on stderr or from the
> >shell, no one will think "QEMU process dead and hung == check seccomp"
> >immediately.  It's frustrating to deal with a "silent" failure.
> 
> The process dies with a SIGKILL, and sig handling in Qemu is hard to
> implement due to dozen of external linked libraries that has their
> own signal masks and conflicts with seccomp. I've already tried this
> approach in the past (you can find in the list by searching for
> debug mode)

I now realize we may be talking past each other.  Dying with
SIGKILL/SIGSYS is perfectly reasonable and I would be happy with that
:-).

But I think there's a bug in seccomp: a multi-threaded process can be
left in a zombie state.  In my case the primary thread was killed by
seccomp but another thread was deadlocked on a futex.

The result is the process isn't quite dead yet.  The shell will not reap
it and we're stuck with a zombie.

I can reproduce it reliably when I run "qemu-system-x86_64 -sandbox on"
on Fedora 20 (qemu-system-x86-1.6.1-2).

Should seccomp use do_group_exit() for SIGKILL?

Stefan
Will Drewry Dec. 5, 2013, 4:12 p.m. UTC | #18
On Thu, Dec 5, 2013 at 7:15 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Wed, Dec 04, 2013 at 11:21:12AM -0200, Eduardo Otubo wrote:
>> On 12/04/2013 07:39 AM, Stefan Hajnoczi wrote:
>> >On Fri, Nov 22, 2013 at 11:00:24AM -0500, Paul Moore wrote:
>> >>>Developers will only be happy with seccomp if it's easy and rewarding to
>> >>>support/debug.
>> >>
>> >>Agreed.
>> >>
>> >>As a developer, how do you feel about the audit/syslog based approach I
>> >>mentioned earlier?
>> >
>> >I used the commands you posted (I think that's what you mean).  They
>> >produce useful output.
>> >
>> >The problem is that without an error message on stderr or from the
>> >shell, no one will think "QEMU process dead and hung == check seccomp"
>> >immediately.  It's frustrating to deal with a "silent" failure.
>>
>> The process dies with a SIGKILL, and sig handling in Qemu is hard to
>> implement due to dozen of external linked libraries that has their
>> own signal masks and conflicts with seccomp. I've already tried this
>> approach in the past (you can find in the list by searching for
>> debug mode)
>
> I now realize we may be talking past each other.  Dying with
> SIGKILL/SIGSYS is perfectly reasonable and I would be happy with that
> :-).
>
> But I think there's a bug in seccomp: a multi-threaded process can be
> left in a zombie state.  In my case the primary thread was killed by
> seccomp but another thread was deadlocked on a futex.
>
> The result is the process isn't quite dead yet.  The shell will not reap
> it and we're stuck with a zombie.
>
> I can reproduce it reliably when I run "qemu-system-x86_64 -sandbox on"
> on Fedora 20 (qemu-system-x86-1.6.1-2).
>
> Should seccomp use do_group_exit() for SIGKILL?

Is the problem that the SECCOMP_RET_KILL didn't take down the thread
group (which would be a departure from how seccomp(mode=1) worked) and
causes the deadlock somehow, or is it that the other thread is
deadlocked?

Regardless, adding a SECCOMP_RET_TGKILL probably isn't a bad idea :)

cheers!
will
Stefan Hajnoczi Dec. 6, 2013, 9:13 a.m. UTC | #19
On Thu, Dec 05, 2013 at 10:12:00AM -0600, Will Drewry wrote:
> On Thu, Dec 5, 2013 at 7:15 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Wed, Dec 04, 2013 at 11:21:12AM -0200, Eduardo Otubo wrote:
> >> On 12/04/2013 07:39 AM, Stefan Hajnoczi wrote:
> >> >On Fri, Nov 22, 2013 at 11:00:24AM -0500, Paul Moore wrote:
> >> >>>Developers will only be happy with seccomp if it's easy and rewarding to
> >> >>>support/debug.
> >> >>
> >> >>Agreed.
> >> >>
> >> >>As a developer, how do you feel about the audit/syslog based approach I
> >> >>mentioned earlier?
> >> >
> >> >I used the commands you posted (I think that's what you mean).  They
> >> >produce useful output.
> >> >
> >> >The problem is that without an error message on stderr or from the
> >> >shell, no one will think "QEMU process dead and hung == check seccomp"
> >> >immediately.  It's frustrating to deal with a "silent" failure.
> >>
> >> The process dies with a SIGKILL, and sig handling in Qemu is hard to
> >> implement due to dozen of external linked libraries that has their
> >> own signal masks and conflicts with seccomp. I've already tried this
> >> approach in the past (you can find in the list by searching for
> >> debug mode)
> >
> > I now realize we may be talking past each other.  Dying with
> > SIGKILL/SIGSYS is perfectly reasonable and I would be happy with that
> > :-).
> >
> > But I think there's a bug in seccomp: a multi-threaded process can be
> > left in a zombie state.  In my case the primary thread was killed by
> > seccomp but another thread was deadlocked on a futex.
> >
> > The result is the process isn't quite dead yet.  The shell will not reap
> > it and we're stuck with a zombie.
> >
> > I can reproduce it reliably when I run "qemu-system-x86_64 -sandbox on"
> > on Fedora 20 (qemu-system-x86-1.6.1-2).
> >
> > Should seccomp use do_group_exit() for SIGKILL?
> 
> Is the problem that the SECCOMP_RET_KILL didn't take down the thread
> group (which would be a departure from how seccomp(mode=1) worked) and
> causes the deadlock somehow, or is it that the other thread is
> deadlocked?

The former.

When the first thread is killed by seccomp, the second thread in the
process is left waiting on a futex forever.  Therefore the process never
exits after the seccomp violation occurs.

Directing the signal at a thread makes perfect sense for
SECCOMP_RET_TRAP since the thread can handle the signal and recover.
But for SECCOMP_RET_KILL it's probably more useful to kill the entire
process rather than just a single thread.

> Regardless, adding a SECCOMP_RET_TGKILL probably isn't a bad idea :)

Yes.  Do you have time for that or would you like me to send a patch?

Stefan
Will Drewry Dec. 6, 2013, 3:40 p.m. UTC | #20
On Fri, Dec 6, 2013 at 3:13 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Thu, Dec 05, 2013 at 10:12:00AM -0600, Will Drewry wrote:
>> On Thu, Dec 5, 2013 at 7:15 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> > On Wed, Dec 04, 2013 at 11:21:12AM -0200, Eduardo Otubo wrote:
>> >> On 12/04/2013 07:39 AM, Stefan Hajnoczi wrote:
>> >> >On Fri, Nov 22, 2013 at 11:00:24AM -0500, Paul Moore wrote:
>> >> >>>Developers will only be happy with seccomp if it's easy and rewarding to
>> >> >>>support/debug.
>> >> >>
>> >> >>Agreed.
>> >> >>
>> >> >>As a developer, how do you feel about the audit/syslog based approach I
>> >> >>mentioned earlier?
>> >> >
>> >> >I used the commands you posted (I think that's what you mean).  They
>> >> >produce useful output.
>> >> >
>> >> >The problem is that without an error message on stderr or from the
>> >> >shell, no one will think "QEMU process dead and hung == check seccomp"
>> >> >immediately.  It's frustrating to deal with a "silent" failure.
>> >>
>> >> The process dies with a SIGKILL, and sig handling in Qemu is hard to
>> >> implement due to dozen of external linked libraries that has their
>> >> own signal masks and conflicts with seccomp. I've already tried this
>> >> approach in the past (you can find in the list by searching for
>> >> debug mode)
>> >
>> > I now realize we may be talking past each other.  Dying with
>> > SIGKILL/SIGSYS is perfectly reasonable and I would be happy with that
>> > :-).
>> >
>> > But I think there's a bug in seccomp: a multi-threaded process can be
>> > left in a zombie state.  In my case the primary thread was killed by
>> > seccomp but another thread was deadlocked on a futex.
>> >
>> > The result is the process isn't quite dead yet.  The shell will not reap
>> > it and we're stuck with a zombie.
>> >
>> > I can reproduce it reliably when I run "qemu-system-x86_64 -sandbox on"
>> > on Fedora 20 (qemu-system-x86-1.6.1-2).
>> >
>> > Should seccomp use do_group_exit() for SIGKILL?
>>
>> Is the problem that the SECCOMP_RET_KILL didn't take down the thread
>> group (which would be a departure from how seccomp(mode=1) worked) and
>> causes the deadlock somehow, or is it that the other thread is
>> deadlocked?
>
> The former.
>
> When the first thread is killed by seccomp, the second thread in the
> process is left waiting on a futex forever.  Therefore the process never
> exits after the seccomp violation occurs.
>
> Directing the signal at a thread makes perfect sense for
> SECCOMP_RET_TRAP since the thread can handle the signal and recover.
> But for SECCOMP_RET_KILL it's probably more useful to kill the entire
> process rather than just a single thread.

Would it be possible to just have the offending thread die with
SECCOMP_RET_TRAP then have a SIGSYS handler that calls tgkill?

>
>> Regardless, adding a SECCOMP_RET_TGKILL probably isn't a bad idea :)
>
> Yes.  Do you have time for that or would you like me to send a patch?

A straight SECCOMP_RET_TGKILL code could be a bit awkward, but it
might make sense to add tgkill as "data" OR'd with RET_KILL since
those 16 bits of data are still unused.  I didn't have a clear plan
for those bits with RET_KILL, but I think they would be fair game for
this sort of extension if it is really impractical to use a trap.
I'll play with a patch next week, but I'd be happy to see alternative
approaches too!  (I know I've been kicking around a separate patch for
apply per-task behavioral flags for filters that this could fit in
too, but there will be some ABI challenges that have led me to
approach it _very_ slowly.)

thanks!
Stefan Hajnoczi Dec. 7, 2013, 8:13 a.m. UTC | #21
On Fri, Dec 6, 2013 at 4:40 PM, Will Drewry <wad@chromium.org> wrote:
> On Fri, Dec 6, 2013 at 3:13 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Thu, Dec 05, 2013 at 10:12:00AM -0600, Will Drewry wrote:
>>> On Thu, Dec 5, 2013 at 7:15 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>> > On Wed, Dec 04, 2013 at 11:21:12AM -0200, Eduardo Otubo wrote:
>>> >> On 12/04/2013 07:39 AM, Stefan Hajnoczi wrote:
>>> >> >On Fri, Nov 22, 2013 at 11:00:24AM -0500, Paul Moore wrote:
>>> >> >>>Developers will only be happy with seccomp if it's easy and rewarding to
>>> >> >>>support/debug.
>>> >> >>
>>> >> >>Agreed.
>>> >> >>
>>> >> >>As a developer, how do you feel about the audit/syslog based approach I
>>> >> >>mentioned earlier?
>>> >> >
>>> >> >I used the commands you posted (I think that's what you mean).  They
>>> >> >produce useful output.
>>> >> >
>>> >> >The problem is that without an error message on stderr or from the
>>> >> >shell, no one will think "QEMU process dead and hung == check seccomp"
>>> >> >immediately.  It's frustrating to deal with a "silent" failure.
>>> >>
>>> >> The process dies with a SIGKILL, and sig handling in Qemu is hard to
>>> >> implement due to dozen of external linked libraries that has their
>>> >> own signal masks and conflicts with seccomp. I've already tried this
>>> >> approach in the past (you can find in the list by searching for
>>> >> debug mode)
>>> >
>>> > I now realize we may be talking past each other.  Dying with
>>> > SIGKILL/SIGSYS is perfectly reasonable and I would be happy with that
>>> > :-).
>>> >
>>> > But I think there's a bug in seccomp: a multi-threaded process can be
>>> > left in a zombie state.  In my case the primary thread was killed by
>>> > seccomp but another thread was deadlocked on a futex.
>>> >
>>> > The result is the process isn't quite dead yet.  The shell will not reap
>>> > it and we're stuck with a zombie.
>>> >
>>> > I can reproduce it reliably when I run "qemu-system-x86_64 -sandbox on"
>>> > on Fedora 20 (qemu-system-x86-1.6.1-2).
>>> >
>>> > Should seccomp use do_group_exit() for SIGKILL?
>>>
>>> Is the problem that the SECCOMP_RET_KILL didn't take down the thread
>>> group (which would be a departure from how seccomp(mode=1) worked) and
>>> causes the deadlock somehow, or is it that the other thread is
>>> deadlocked?
>>
>> The former.
>>
>> When the first thread is killed by seccomp, the second thread in the
>> process is left waiting on a futex forever.  Therefore the process never
>> exits after the seccomp violation occurs.
>>
>> Directing the signal at a thread makes perfect sense for
>> SECCOMP_RET_TRAP since the thread can handle the signal and recover.
>> But for SECCOMP_RET_KILL it's probably more useful to kill the entire
>> process rather than just a single thread.
>
> Would it be possible to just have the offending thread die with
> SECCOMP_RET_TRAP then have a SIGSYS handler that calls tgkill?

It is more secure if the kernel kills the process.  If we trap back
into the process then an attacker may be able to avoid tgkill suicide
:).

I know usually an attacker would just avoid a seccomp violation in the
first place once they've taken control, but there are probably
situations where it matters.

Stefan
diff mbox

Patch

diff --git a/qemu-options.hx b/qemu-options.hx
index 5dc8b75..315a86d 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2982,13 +2982,13 @@  Old param mode (ARM only).
 ETEXI
 
 DEF("sandbox", HAS_ARG, QEMU_OPTION_sandbox, \
-    "-sandbox <arg>  Enable seccomp mode 2 system call filter (default 'off').\n",
+    "-sandbox <arg>  Enable seccomp mode 2 system call filter (default 'on').\n",
     QEMU_ARCH_ALL)
 STEXI
 @item -sandbox @var{arg}
 @findex -sandbox
 Enable Seccomp mode 2 system call filter. 'on' will enable syscall filtering and 'off' will
-disable it.  The default is 'off'.
+disable it.  The default is 'on'.
 ETEXI
 
 DEF("readconfig", HAS_ARG, QEMU_OPTION_readconfig,
diff --git a/vl.c b/vl.c
index b42ac67..ae3bdc9 100644
--- a/vl.c
+++ b/vl.c
@@ -529,6 +529,20 @@  static QemuOptsList qemu_msg_opts = {
     },
 };
 
+static QemuOpts *qemu_get_sandbox_opts(void)
+{
+    QemuOptsList *list;
+    QemuOpts *opts;
+
+    list = qemu_find_opts("sandbox");
+    assert(list);
+    opts = qemu_opts_find(list, NULL);
+    if (!opts) {
+        opts = qemu_opts_create_nofail(list);
+    }
+    return opts;
+}
+
 /**
  * Get machine options
  *
@@ -960,24 +974,9 @@  static int bt_parse(const char *opt)
     return 1;
 }
 
-static int parse_sandbox(QemuOpts *opts, void *opaque)
+static bool sandbox_enabled(bool default_usb)
 {
-    /* FIXME: change this to true for 1.3 */
-    if (qemu_opt_get_bool(opts, "enable", false)) {
-#ifdef CONFIG_SECCOMP
-        if (seccomp_start() < 0) {
-            qerror_report(ERROR_CLASS_GENERIC_ERROR,
-                          "failed to install seccomp syscall filter in the kernel");
-            return -1;
-        }
-#else
-        qerror_report(ERROR_CLASS_GENERIC_ERROR,
-                      "sandboxing request but seccomp is not compiled into this build");
-        return -1;
-#endif
-    }
-
-    return 0;
+    return qemu_opt_get_bool(qemu_get_sandbox_opts(), "sandbox", default_usb);
 }
 
 bool usb_enabled(bool default_usb)
@@ -3806,8 +3805,18 @@  int main(int argc, char **argv, char **envp)
         exit(1);
     }
 
-    if (qemu_opts_foreach(qemu_find_opts("sandbox"), parse_sandbox, NULL, 0)) {
-        exit(1);
+    if (sandbox_enabled(true)) {
+#ifdef CONFIG_SECCOMP
+        if (seccomp_start() < 0) {
+            qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                          "failed to install seccomp syscall filter in the kernel");
+            return -1;
+        }
+#else
+        qerror_report(ERROR_CLASS_GENERIC_ERROR,
+                      "sandboxing request but seccomp is not compiled into this build");
+        return -1;
+#endif
     }
 
 #ifndef _WIN32