diff mbox

[v2] unix: properly account for FDs passed over unix sockets

Message ID 201601100657.u0A6vk1B025554@mail.home.local
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Willy Tarreau Jan. 9, 2016, 8:54 p.m. UTC
It is possible for a process to allocate and accumulate far more FDs than
the process' limit by sending them over a unix socket then closing them
to keep the process' fd count low.

This change addresses this problem by keeping track of the number of FDs
in flight per user and preventing non-privileged processes from having
more FDs in flight than their configured FD limit.

Reported-by: socketpair@gmail.com
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Mitigates: CVE-2013-4312 (Linux 2.0+)
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Willy Tarreau <w@1wt.eu>
---
v2: add reported-by, mitigates and acked-by.

It would be nice if (if accepted) it would be backported to -stable as the
issue is currently exploitable.
---
 include/linux/sched.h |  1 +
 net/unix/af_unix.c    | 24 ++++++++++++++++++++----
 net/unix/garbage.c    | 13 ++++++++-----
 3 files changed, 29 insertions(+), 9 deletions(-)

Comments

David Miller Jan. 11, 2016, 5:05 a.m. UTC | #1
From: Willy Tarreau <w@1wt.eu>
Date: Sun, Jan 10 07:54:56 CET 2016

> It is possible for a process to allocate and accumulate far more FDs than
> the process' limit by sending them over a unix socket then closing them
> to keep the process' fd count low.
> 
> This change addresses this problem by keeping track of the number of FDs
> in flight per user and preventing non-privileged processes from having
> more FDs in flight than their configured FD limit.
> 
> Reported-by: socketpair@gmail.com
> Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Mitigates: CVE-2013-4312 (Linux 2.0+)
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Signed-off-by: Willy Tarreau <w@1wt.eu>

Applied and queued up for -stable, thanks!
David Herrmann Feb. 2, 2016, 5:34 p.m. UTC | #2
Hi

On Sun, Jan 10, 2016 at 7:54 AM, Willy Tarreau <w@1wt.eu> wrote:
> It is possible for a process to allocate and accumulate far more FDs than
> the process' limit by sending them over a unix socket then closing them
> to keep the process' fd count low.
>
> This change addresses this problem by keeping track of the number of FDs
> in flight per user and preventing non-privileged processes from having
> more FDs in flight than their configured FD limit.

This is not really what this patch does. This patch rather prevents
processes from having more of their owned *files* in flight than their
configured FD limit. See below for details..

> Reported-by: socketpair@gmail.com
> Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Mitigates: CVE-2013-4312 (Linux 2.0+)
> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> Signed-off-by: Willy Tarreau <w@1wt.eu>
> ---
> v2: add reported-by, mitigates and acked-by.
>
> It would be nice if (if accepted) it would be backported to -stable as the
> issue is currently exploitable.
> ---
>  include/linux/sched.h |  1 +
>  net/unix/af_unix.c    | 24 ++++++++++++++++++++----
>  net/unix/garbage.c    | 13 ++++++++-----
>  3 files changed, 29 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index edad7a4..fbf25f1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -830,6 +830,7 @@ struct user_struct {
>         unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
>  #endif
>         unsigned long locked_shm; /* How many pages of mlocked shm ? */
> +       unsigned long unix_inflight;    /* How many files in flight in unix sockets */
>
>  #ifdef CONFIG_KEYS
>         struct key *uid_keyring;        /* UID specific keyring */
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 45aebd9..d6d7b43 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -1499,6 +1499,21 @@ static void unix_destruct_scm(struct sk_buff *skb)
>         sock_wfree(skb);
>  }
>
> +/*
> + * The "user->unix_inflight" variable is protected by the garbage
> + * collection lock, and we just read it locklessly here. If you go
> + * over the limit, there might be a tiny race in actually noticing
> + * it across threads. Tough.

This limit is checked once per transaction. IIRC, a transaction can
carry 255 files. Thus, it is easy to exceed this limit without any
race involved.

> + */
> +static inline bool too_many_unix_fds(struct task_struct *p)
> +{
> +       struct user_struct *user = current_user();
> +
> +       if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
> +               return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
> +       return false;
> +}
> +
>  #define MAX_RECURSION_LEVEL 4
>
>  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
> @@ -1507,6 +1522,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
>         unsigned char max_level = 0;
>         int unix_sock_count = 0;
>
> +       if (too_many_unix_fds(current))
> +               return -ETOOMANYREFS;
> +

Ignoring the capabilities, this effectively resolves to:

    if (current_cred()->user->unix_inflight > rlimit(RLIMIT_NOFILE))
            return -ETOOMANYREFS;

It limits the number of inflight FDs of the *current* user to its *own* limit.

But..

>         for (i = scm->fp->count - 1; i >= 0; i--) {
>                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
>
> @@ -1528,10 +1546,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
>         if (!UNIXCB(skb).fp)
>                 return -ENOMEM;
>
> -       if (unix_sock_count) {
> -               for (i = scm->fp->count - 1; i >= 0; i--)
> -                       unix_inflight(scm->fp->fp[i]);
> -       }
> +       for (i = scm->fp->count - 1; i >= 0; i--)
> +               unix_inflight(scm->fp->fp[i]);
>         return max_level;
>  }
>
> diff --git a/net/unix/garbage.c b/net/unix/garbage.c
> index a73a226..8fcdc22 100644
> --- a/net/unix/garbage.c
> +++ b/net/unix/garbage.c
> @@ -120,11 +120,11 @@ void unix_inflight(struct file *fp)
>  {
>         struct sock *s = unix_get_socket(fp);
>
> +       spin_lock(&unix_gc_lock);
> +
>         if (s) {
>                 struct unix_sock *u = unix_sk(s);
>
> -               spin_lock(&unix_gc_lock);
> -
>                 if (atomic_long_inc_return(&u->inflight) == 1) {
>                         BUG_ON(!list_empty(&u->link));
>                         list_add_tail(&u->link, &gc_inflight_list);
> @@ -132,25 +132,28 @@ void unix_inflight(struct file *fp)
>                         BUG_ON(list_empty(&u->link));
>                 }
>                 unix_tot_inflight++;
> -               spin_unlock(&unix_gc_lock);
>         }
> +       fp->f_cred->user->unix_inflight++;

..this modifies the limit of the owner of the file you send. As such,
if you only send files that you don't own, you will continuously bump
the limit of the file-owner, but never your own. As such, the
protection above will never fire.

Is this really what this patch is supposed to do? Shouldn't the loop
in unix_attach_fds() rather check for this:

    if (unlikely(fp->f_cred->user->unix_inflight > nofile &&
            !file_ns_capable(fp, &init_user_ns, CAP_SYS_RESOURCE) &&
            !file_ns_capable(fp, &init_user_ns, CAP_SYS_ADMIN)))
            return -ETOOMANYREFS;

(or keeping capable() rather than file_ns_capable(), depending on the
wanted semantics)

Furthermore, with this patch in place, a process better not pass any
file-descriptors to an untrusted process. This might have been a
stupid idea in the first place, but I would have expected the patch to
mention this. Because with this patch in place, a receiver of a
file-descriptor can bump the unix_inflight limit of the sender
arbitrarily. Did anyone notify the dbus maintainers of this? They
might wanna document this, if not already done (CC: smcv).

Why doesn't this patch modify "unix_inflight" of the sender rather
than the passed descriptors? It would require pinning the affected
user in 'scm' contexts, but that is probably already the case, anyway.
This way, the original ETOOMANYREFS check would be perfectly fine.

Anyway, can someone provide a high-level description of what exactly
this patch is supposed to do? Which operation should be limited, who
should inflight FDs be accounted on, and which rlimit should be used
on each operation? I'm having a hard time auditing existing
user-space, given just the scarce description of this commit.

Thanks
David

> +       spin_unlock(&unix_gc_lock);
>  }
>
>  void unix_notinflight(struct file *fp)
>  {
>         struct sock *s = unix_get_socket(fp);
>
> +       spin_lock(&unix_gc_lock);
> +
>         if (s) {
>                 struct unix_sock *u = unix_sk(s);
>
> -               spin_lock(&unix_gc_lock);
>                 BUG_ON(list_empty(&u->link));
>
>                 if (atomic_long_dec_and_test(&u->inflight))
>                         list_del_init(&u->link);
>                 unix_tot_inflight--;
> -               spin_unlock(&unix_gc_lock);
>         }
> +       fp->f_cred->user->unix_inflight--;
> +       spin_unlock(&unix_gc_lock);
>  }
>
>  static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
> --
> 1.7.12.1
>
>
>
Simon McVittie Feb. 3, 2016, 11:36 a.m. UTC | #3
On 02/02/16 17:34, David Herrmann wrote:
> Furthermore, with this patch in place, a process better not pass any
> file-descriptors to an untrusted process.
...
> Did anyone notify the dbus maintainers of this? They
> might wanna document this, if not already done (CC: smcv).

Sorry, I'm not clear from this message on what patterns I should be
documenting as bad, and what the effect of non-compliance would be.

dbus-daemon has a fd-passing feature, which uses AF_UNIX sockets'
existing ability to pass fds to let users of D-Bus attach fds to their
messages. The message is passed from the sending client to dbus-daemon,
then from dbus-daemon to the recipient:

             AF_UNIX             AF_UNIX
               |                    |
    sender -------> dbus-daemon -------> recipient
               |                    |

This has been API since before I was a D-Bus maintainer, so I have no
influence over its existence; just like the kernel doesn't want to break
user-space, dbus-daemon doesn't want to break its users.

The system dbus-daemon (dbus-daemon --system) is a privilege boundary,
and accepts senders and recipients with differing privileges. Without
configuration, messages are denied by default. Recipients can open this
up (by installing system-wide configuration) to allow arbitrary
processes to send messages to them, so that they can carry out their own
discretionary access control. Since 2014, the system dbus-daemon accepts
up to 16 file descriptors per message by default.

There is also a session or user dbus-daemon (dbus-daemon --session) per
uid, but that isn't normally a privilege boundary, so any user trying to
carry out a denial of service there is only hurting themselves.

Am I right in saying that the advice I give to D-Bus users should be
something like this?

* system services should not send fds at all, unless they trust the
  dbus-daemon
* system services should not send fds via D-Bus that will be delivered
  to recipients that they do not trust
* sending fds to an untrusted recipient would enable that recipient to
  carry out a denial-of-service attack (on what? the sender? the
  dbus-daemon?)
Hannes Frederic Sowa Feb. 3, 2016, 11:56 a.m. UTC | #4
On 03.02.2016 12:36, Simon McVittie wrote:
> On 02/02/16 17:34, David Herrmann wrote:
>> Furthermore, with this patch in place, a process better not pass any
>> file-descriptors to an untrusted process.
> ...
>> Did anyone notify the dbus maintainers of this? They
>> might wanna document this, if not already done (CC: smcv).
>
> Sorry, I'm not clear from this message on what patterns I should be
> documenting as bad, and what the effect of non-compliance would be.
>
> dbus-daemon has a fd-passing feature, which uses AF_UNIX sockets'
> existing ability to pass fds to let users of D-Bus attach fds to their
> messages. The message is passed from the sending client to dbus-daemon,
> then from dbus-daemon to the recipient:
>
>               AF_UNIX             AF_UNIX
>                 |                    |
>      sender -------> dbus-daemon -------> recipient
>                 |                    |
>
> This has been API since before I was a D-Bus maintainer, so I have no
> influence over its existence; just like the kernel doesn't want to break
> user-space, dbus-daemon doesn't want to break its users.
>
> The system dbus-daemon (dbus-daemon --system) is a privilege boundary,
> and accepts senders and recipients with differing privileges. Without
> configuration, messages are denied by default. Recipients can open this
> up (by installing system-wide configuration) to allow arbitrary
> processes to send messages to them, so that they can carry out their own
> discretionary access control. Since 2014, the system dbus-daemon accepts
> up to 16 file descriptors per message by default.
>
> There is also a session or user dbus-daemon (dbus-daemon --session) per
> uid, but that isn't normally a privilege boundary, so any user trying to
> carry out a denial of service there is only hurting themselves.
>
> Am I right in saying that the advice I give to D-Bus users should be
> something like this?
>
> * system services should not send fds at all, unless they trust the
>    dbus-daemon
> * system services should not send fds via D-Bus that will be delivered
>    to recipients that they do not trust
> * sending fds to an untrusted recipient would enable that recipient to
>    carry out a denial-of-service attack (on what? the sender? the
>    dbus-daemon?)
>

The described behavior was simply a bug in the referenced patch. I 
already posted a follow-up to change this behavior so that only the 
current sending process is credited with the number of fds in flight:

<https://patchwork.ozlabs.org/patch/577653/>

Other processes (in this case the original opener of the file) aren't 
credited anymore if they do not send the fd themselves.

That said, I don't think you need to change anything or give different 
advice because of this thread.

Thanks,
Hannes
David Herrmann Feb. 3, 2016, 11:56 a.m. UTC | #5
Hi

On Wed, Feb 3, 2016 at 12:36 PM, Simon McVittie
<simon.mcvittie@collabora.co.uk> wrote:
> Am I right in saying that the advice I give to D-Bus users should be
> something like this?
>
> * system services should not send fds at all, unless they trust the
>   dbus-daemon
> * system services should not send fds via D-Bus that will be delivered
>   to recipients that they do not trust
> * sending fds to an untrusted recipient would enable that recipient to
>   carry out a denial-of-service attack (on what? the sender? the
>   dbus-daemon?)

With the revised patch from Hannes, this should no longer be needed.
My original concern was only about accounting inflight-fds on the
file-owner, rather than the sender.

However, with Hannes' revised patch, a different DoS attack against
dbus-daemon is possible. Imagine a peer that receives batches of FDs,
but never dequeues them. They will be accounted on the inflight-limit
of dbus-daemon, as such causing messages of independent peers to be
rejected in case they carry FDs.
Preferably, dbus-daemon would avoid queuing more than 16 FDs on a
single destination (total). But that would require POLLOUT to be
capped by the number of queued fds. A possible workaround is to add
CAP_SYS_RESOURCE to dbus-daemon.

Thanks
David
Simon McVittie Feb. 3, 2016, 12:49 p.m. UTC | #6
On 03/02/16 11:56, David Herrmann wrote:
> However, with Hannes' revised patch, a different DoS attack against
> dbus-daemon is possible. Imagine a peer that receives batches of FDs,
> but never dequeues them. They will be accounted on the inflight-limit
> of dbus-daemon, as such causing messages of independent peers to be
> rejected in case they carry FDs.
> Preferably, dbus-daemon would avoid queuing more than 16 FDs on a
> single destination (total). But that would require POLLOUT to be
> capped by the number of queued fds. A possible workaround is to add
> CAP_SYS_RESOURCE to dbus-daemon.

We have several mitigations for this:

* there's a limit on outgoing fds queued inside dbus-daemon for a
  particular recipient connection, currently 64, and if that's
  exceeded, we stop accepting messages for that recipient, feeding back
  a send failure to the sender for messages to that recipient;
* there's a time limit for how long any given fd can stay queued up
  inside dbus-daemon, currently 2.5 minutes, and if that's exceeded, we
  drop the message;
* if started as root[1], we increase our fd limit to 64k before
  dropping privileges to the intended uid, which in combination with
  the maximum of 256 connections per uid and 64 fds per connection means
  it takes multiple uids working together to carry out a DoS;
* all of those limits can be adjusted by a local sysadmin if necessary,
  if their users are particularly hostile (for instance 16 fds per
  message is a generous limit intended to avoid unforeseen breakage,
  and 4 would probably be enough in practice)

The queueing logic is fairly involved, so I'd appreciate suggested
patches on freedesktop.org Bugzilla or to
dbus-security@lists.freedesktop.org if you have an idea for better
limits. If you believe you have found a non-public vulnerability, please
mark any Bugzilla bugs as restricted to the D-Bus security group.

Thanks,
    S

[1] The system dbus-daemon is started as root (and hence able to
increase its own fd limit) on all systemd systems, anything that uses
the Red Hat or Slackware sysvinit scripts bundled with dbus, and any
Debian derivatives that use sysvinit and have taken security updates
from at least Debian 7. Other distro-specific init system glue is up to
the relevant distro.
Hannes Frederic Sowa Feb. 3, 2016, 2:07 p.m. UTC | #7
On 03.02.2016 12:56, David Herrmann wrote:
> However, with Hannes' revised patch, a different DoS attack against
> dbus-daemon is possible. Imagine a peer that receives batches of FDs,
> but never dequeues them. They will be accounted on the inflight-limit
> of dbus-daemon, as such causing messages of independent peers to be
> rejected in case they carry FDs.

Yes, that is true.

We also kind of have the problem with unconnected af-unix dgram sockets: 
if the receiver does not read the skbs on the receive queue, we don't 
free up the sending socket's wmem, thus stopping the socket from being 
destructed, and this can block the process during sendmsg on that socket. 
This is harder to DoS but is pretty much the same scheme.

Bye,
Hannes
diff mbox

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index edad7a4..fbf25f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -830,6 +830,7 @@  struct user_struct {
 	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
 #endif
 	unsigned long locked_shm; /* How many pages of mlocked shm ? */
+	unsigned long unix_inflight;	/* How many files in flight in unix sockets */
 
 #ifdef CONFIG_KEYS
 	struct key *uid_keyring;	/* UID specific keyring */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 45aebd9..d6d7b43 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1499,6 +1499,21 @@  static void unix_destruct_scm(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
+/*
+ * The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+	struct user_struct *user = current_user();
+
+	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+	return false;
+}
+
 #define MAX_RECURSION_LEVEL 4
 
 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
@@ -1507,6 +1522,9 @@  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	unsigned char max_level = 0;
 	int unix_sock_count = 0;
 
+	if (too_many_unix_fds(current))
+		return -ETOOMANYREFS;
+
 	for (i = scm->fp->count - 1; i >= 0; i--) {
 		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
 
@@ -1528,10 +1546,8 @@  static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	if (!UNIXCB(skb).fp)
 		return -ENOMEM;
 
-	if (unix_sock_count) {
-		for (i = scm->fp->count - 1; i >= 0; i--)
-			unix_inflight(scm->fp->fp[i]);
-	}
+	for (i = scm->fp->count - 1; i >= 0; i--)
+		unix_inflight(scm->fp->fp[i]);
 	return max_level;
 }
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a73a226..8fcdc22 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -120,11 +120,11 @@  void unix_inflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
 
+	spin_lock(&unix_gc_lock);
+
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
 
-		spin_lock(&unix_gc_lock);
-
 		if (atomic_long_inc_return(&u->inflight) == 1) {
 			BUG_ON(!list_empty(&u->link));
 			list_add_tail(&u->link, &gc_inflight_list);
@@ -132,25 +132,28 @@  void unix_inflight(struct file *fp)
 			BUG_ON(list_empty(&u->link));
 		}
 		unix_tot_inflight++;
-		spin_unlock(&unix_gc_lock);
 	}
+	fp->f_cred->user->unix_inflight++;
+	spin_unlock(&unix_gc_lock);
 }
 
 void unix_notinflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
 
+	spin_lock(&unix_gc_lock);
+
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
 
-		spin_lock(&unix_gc_lock);
 		BUG_ON(list_empty(&u->link));
 
 		if (atomic_long_dec_and_test(&u->inflight))
 			list_del_init(&u->link);
 		unix_tot_inflight--;
-		spin_unlock(&unix_gc_lock);
 	}
+	fp->f_cred->user->unix_inflight--;
+	spin_unlock(&unix_gc_lock);
 }
 
 static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),