Patchwork net: unix: fix inflight counting bug in garbage collector

login
register
mail settings
Submitter Miklos Szeredi
Date Nov. 9, 2008, 2:23 p.m.
Message ID <E1KzBCn-00062L-Fp@pomaz-ex.szeredi.hu>
Download mbox | patch
Permalink /patch/7915/
State Accepted
Delegated to: David Miller
Headers show

Comments

Miklos Szeredi - Nov. 9, 2008, 2:23 p.m.
Hi David,

This patch fixes the BUG_ON triggered by Andrea's tests.  It turned
out to be completely independent of the stack overflow issue, but
happens to be triggered by the same test program.

Should qualify for -stable too.

Thanks,
Miklos

----
From: Miklos Szeredi <mszeredi@suse.cz>

Previously I assumed that the receive queues of candidates don't
change during the GC.  This is only half true, nothing can be received
from the queues (see comment in unix_gc()), but buffers could be added
through the other half of the socket pair, which may still have file
descriptors referring to it.

This can result in inc_inflight_move_tail() erronously increasing the
"inflight" counter for a unix socket for which dec_inflight() wasn't
previously called.  This in turn can trigger the "BUG_ON(total_refs <
inflight_refs)" in a later garbage collection run.

Fix this by only manipulating the "inflight" counter for sockets which
are candidates themselves.  Duplicating the file references in
unix_attach_fds() is also needed to prevent a socket becoming a
candidate for GC while the skb that contains it is not yet queued.

Reported-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
---
 include/net/af_unix.h |    1 +
 net/unix/af_unix.c    |   31 ++++++++++++++++++++++++-------
 net/unix/garbage.c    |   49 +++++++++++++++++++++++++++++++++++++------------
 3 files changed, 62 insertions(+), 19 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller - Nov. 11, 2008, 6:59 a.m.
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 09 Nov 2008 15:23:57 +0100

> This patch fixes the BUG_ON triggered by Andrea's tests.  It turned
> out to be completely independent of the stack overflow issue, but
> happens to be triggered by the same test program.
> 
> Should qualify for -stable too.

Miklos thanks a lot for fixing this.  And Linus thanks for
queueing this up to 2.6.28-rc4

Stable folks, please include this in -stable as soon as you
can, since the BUG can be triggered by local users.

> ----
> From: Miklos Szeredi <mszeredi@suse.cz>
> 
> Previously I assumed that the receive queues of candidates don't
> change during the GC.  This is only half true, nothing can be received
> from the queues (see comment in unix_gc()), but buffers could be added
> through the other half of the socket pair, which may still have file
> descriptors referring to it.
> 
> This can result in inc_inflight_move_tail() erronously increasing the
> "inflight" counter for a unix socket for which dec_inflight() wasn't
> previously called.  This in turn can trigger the "BUG_ON(total_refs <
> inflight_refs)" in a later garbage collection run.
> 
> Fix this by only manipulating the "inflight" counter for sockets which
> are candidates themselves.  Duplicating the file references in
> unix_attach_fds() is also needed to prevent a socket becoming a
> candidate for GC while the skb that contains it is not yet queued.
> 
> Reported-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
> CC: stable@kernel.org
> ---
>  include/net/af_unix.h |    1 +
>  net/unix/af_unix.c    |   31 ++++++++++++++++++++++++-------
>  net/unix/garbage.c    |   49 +++++++++++++++++++++++++++++++++++++------------
>  3 files changed, 62 insertions(+), 19 deletions(-)
> 
> Index: linux.git/include/net/af_unix.h
> ===================================================================
> --- linux.git.orig/include/net/af_unix.h	2008-11-09 14:39:04.000000000 +0100
> +++ linux.git/include/net/af_unix.h	2008-11-09 14:39:11.000000000 +0100
> @@ -54,6 +54,7 @@ struct unix_sock {
>          atomic_long_t           inflight;
>          spinlock_t		lock;
>  	unsigned int		gc_candidate : 1;
> +	unsigned int		gc_maybe_cycle : 1;
>          wait_queue_head_t       peer_wait;
>  };
>  #define unix_sk(__sk) ((struct unix_sock *)__sk)
> Index: linux.git/net/unix/garbage.c
> ===================================================================
> --- linux.git.orig/net/unix/garbage.c	2008-11-09 14:39:04.000000000 +0100
> +++ linux.git/net/unix/garbage.c	2008-11-09 14:39:11.000000000 +0100
> @@ -186,8 +186,17 @@ static void scan_inflight(struct sock *x
>  				 */
>  				struct sock *sk = unix_get_socket(*fp++);
>  				if (sk) {
> -					hit = true;
> -					func(unix_sk(sk));
> +					struct unix_sock *u = unix_sk(sk);
> +
> +					/*
> +					 * Ignore non-candidates, they could
> +					 * have been added to the queues after
> +					 * starting the garbage collection
> +					 */
> +					if (u->gc_candidate) {
> +						hit = true;
> +						func(u);
> +					}
>  				}
>  			}
>  			if (hit && hitlist != NULL) {
> @@ -249,11 +258,11 @@ static void inc_inflight_move_tail(struc
>  {
>  	atomic_long_inc(&u->inflight);
>  	/*
> -	 * If this is still a candidate, move it to the end of the
> -	 * list, so that it's checked even if it was already passed
> -	 * over
> +	 * If this still might be part of a cycle, move it to the end
> +	 * of the list, so that it's checked even if it was already
> +	 * passed over
>  	 */
> -	if (u->gc_candidate)
> +	if (u->gc_maybe_cycle)
>  		list_move_tail(&u->link, &gc_candidates);
>  }
>  
> @@ -267,6 +276,7 @@ void unix_gc(void)
>  	struct unix_sock *next;
>  	struct sk_buff_head hitlist;
>  	struct list_head cursor;
> +	LIST_HEAD(not_cycle_list);
>  
>  	spin_lock(&unix_gc_lock);
>  
> @@ -282,10 +292,14 @@ void unix_gc(void)
>  	 *
>  	 * Holding unix_gc_lock will protect these candidates from
>  	 * being detached, and hence from gaining an external
> -	 * reference.  This also means, that since there are no
> -	 * possible receivers, the receive queues of these sockets are
> -	 * static during the GC, even though the dequeue is done
> -	 * before the detach without atomicity guarantees.
> +	 * reference.  Since there are no possible receivers, all
> +	 * buffers currently on the candidates' queues stay there
> +	 * during the garbage collection.
> +	 *
> +	 * We also know that no new candidate can be added onto the
> +	 * receive queues.  Other, non candidate sockets _can_ be
> +	 * added to queue, so we must make sure only to touch
> +	 * candidates.
>  	 */
>  	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
>  		long total_refs;
> @@ -299,6 +313,7 @@ void unix_gc(void)
>  		if (total_refs == inflight_refs) {
>  			list_move_tail(&u->link, &gc_candidates);
>  			u->gc_candidate = 1;
> +			u->gc_maybe_cycle = 1;
>  		}
>  	}
>  
> @@ -325,14 +340,24 @@ void unix_gc(void)
>  		list_move(&cursor, &u->link);
>  
>  		if (atomic_long_read(&u->inflight) > 0) {
> -			list_move_tail(&u->link, &gc_inflight_list);
> -			u->gc_candidate = 0;
> +			list_move_tail(&u->link, &not_cycle_list);
> +			u->gc_maybe_cycle = 0;
>  			scan_children(&u->sk, inc_inflight_move_tail, NULL);
>  		}
>  	}
>  	list_del(&cursor);
>  
>  	/*
> +	 * not_cycle_list contains those sockets which do not make up a
> +	 * cycle.  Restore these to the inflight list.
> +	 */
> +	while (!list_empty(&not_cycle_list)) {
> +		u = list_entry(not_cycle_list.next, struct unix_sock, link);
> +		u->gc_candidate = 0;
> +		list_move_tail(&u->link, &gc_inflight_list);
> +	}
> +
> +	/*
>  	 * Now gc_candidates contains only garbage.  Restore original
>  	 * inflight counters for these as well, and remove the skbuffs
>  	 * which are creating the cycle(s).
> Index: linux.git/net/unix/af_unix.c
> ===================================================================
> --- linux.git.orig/net/unix/af_unix.c	2008-11-09 14:39:04.000000000 +0100
> +++ linux.git/net/unix/af_unix.c	2008-11-09 14:39:11.000000000 +0100
> @@ -1302,14 +1302,23 @@ static void unix_destruct_fds(struct sk_
>  	sock_wfree(skb);
>  }
>  
> -static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
> +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
>  {
>  	int i;
> +
> +	/*
> +	 * Need to duplicate file references for the sake of garbage
> +	 * collection.  Otherwise a socket in the fps might become a
> +	 * candidate for GC while the skb is not yet queued.
> +	 */
> +	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
> +	if (!UNIXCB(skb).fp)
> +		return -ENOMEM;
> +
>  	for (i=scm->fp->count-1; i>=0; i--)
>  		unix_inflight(scm->fp->fp[i]);
> -	UNIXCB(skb).fp = scm->fp;
>  	skb->destructor = unix_destruct_fds;
> -	scm->fp = NULL;
> +	return 0;
>  }
>  
>  /*
> @@ -1368,8 +1377,11 @@ static int unix_dgram_sendmsg(struct kio
>  		goto out;
>  
>  	memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
> -	if (siocb->scm->fp)
> -		unix_attach_fds(siocb->scm, skb);
> +	if (siocb->scm->fp) {
> +		err = unix_attach_fds(siocb->scm, skb);
> +		if (err)
> +			goto out_free;
> +	}
>  	unix_get_secdata(siocb->scm, skb);
>  
>  	skb_reset_transport_header(skb);
> @@ -1538,8 +1550,13 @@ static int unix_stream_sendmsg(struct ki
>  		size = min_t(int, size, skb_tailroom(skb));
>  
>  		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
> -		if (siocb->scm->fp)
> -			unix_attach_fds(siocb->scm, skb);
> +		if (siocb->scm->fp) {
> +			err = unix_attach_fds(siocb->scm, skb);
> +			if (err) {
> +				kfree_skb(skb);
> +				goto out_err;
> +			}
> +		}
>  
>  		if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
>  			kfree_skb(skb);
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH - Nov. 11, 2008, 7:32 a.m.
On Mon, Nov 10, 2008 at 10:59:38PM -0800, David Miller wrote:
> From: Miklos Szeredi <miklos@szeredi.hu>
> Date: Sun, 09 Nov 2008 15:23:57 +0100
> 
> > This patch fixes the BUG_ON triggered by Andrea's tests.  It turned
> > out to be completely independent of the stack overflow issue, but
> > happens to be triggered by the same test program.
> > 
> > Should qualify for -stable too.
> 
> Miklos thanks a lot for fixing this.  And Linus thanks for
> queueing this up to 2.6.28-rc4
> 
> Stable folks, please include this in -stable as soon as you
> can, since the BUG can be triggered by local users.

Thanks, will do.

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

Index: linux.git/include/net/af_unix.h
===================================================================
--- linux.git.orig/include/net/af_unix.h	2008-11-09 14:39:04.000000000 +0100
+++ linux.git/include/net/af_unix.h	2008-11-09 14:39:11.000000000 +0100
@@ -54,6 +54,7 @@  struct unix_sock {
         atomic_long_t           inflight;
         spinlock_t		lock;
 	unsigned int		gc_candidate : 1;
+	unsigned int		gc_maybe_cycle : 1;
         wait_queue_head_t       peer_wait;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
Index: linux.git/net/unix/garbage.c
===================================================================
--- linux.git.orig/net/unix/garbage.c	2008-11-09 14:39:04.000000000 +0100
+++ linux.git/net/unix/garbage.c	2008-11-09 14:39:11.000000000 +0100
@@ -186,8 +186,17 @@  static void scan_inflight(struct sock *x
 				 */
 				struct sock *sk = unix_get_socket(*fp++);
 				if (sk) {
-					hit = true;
-					func(unix_sk(sk));
+					struct unix_sock *u = unix_sk(sk);
+
+					/*
+					 * Ignore non-candidates, they could
+					 * have been added to the queues after
+					 * starting the garbage collection
+					 */
+					if (u->gc_candidate) {
+						hit = true;
+						func(u);
+					}
 				}
 			}
 			if (hit && hitlist != NULL) {
@@ -249,11 +258,11 @@  static void inc_inflight_move_tail(struc
 {
 	atomic_long_inc(&u->inflight);
 	/*
-	 * If this is still a candidate, move it to the end of the
-	 * list, so that it's checked even if it was already passed
-	 * over
+	 * If this still might be part of a cycle, move it to the end
+	 * of the list, so that it's checked even if it was already
+	 * passed over
 	 */
-	if (u->gc_candidate)
+	if (u->gc_maybe_cycle)
 		list_move_tail(&u->link, &gc_candidates);
 }
 
@@ -267,6 +276,7 @@  void unix_gc(void)
 	struct unix_sock *next;
 	struct sk_buff_head hitlist;
 	struct list_head cursor;
+	LIST_HEAD(not_cycle_list);
 
 	spin_lock(&unix_gc_lock);
 
@@ -282,10 +292,14 @@  void unix_gc(void)
 	 *
 	 * Holding unix_gc_lock will protect these candidates from
 	 * being detached, and hence from gaining an external
-	 * reference.  This also means, that since there are no
-	 * possible receivers, the receive queues of these sockets are
-	 * static during the GC, even though the dequeue is done
-	 * before the detach without atomicity guarantees.
+	 * reference.  Since there are no possible receivers, all
+	 * buffers currently on the candidates' queues stay there
+	 * during the garbage collection.
+	 *
+	 * We also know that no new candidate can be added onto the
+	 * receive queues.  Other, non candidate sockets _can_ be
+	 * added to queue, so we must make sure only to touch
+	 * candidates.
 	 */
 	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
 		long total_refs;
@@ -299,6 +313,7 @@  void unix_gc(void)
 		if (total_refs == inflight_refs) {
 			list_move_tail(&u->link, &gc_candidates);
 			u->gc_candidate = 1;
+			u->gc_maybe_cycle = 1;
 		}
 	}
 
@@ -325,14 +340,24 @@  void unix_gc(void)
 		list_move(&cursor, &u->link);
 
 		if (atomic_long_read(&u->inflight) > 0) {
-			list_move_tail(&u->link, &gc_inflight_list);
-			u->gc_candidate = 0;
+			list_move_tail(&u->link, &not_cycle_list);
+			u->gc_maybe_cycle = 0;
 			scan_children(&u->sk, inc_inflight_move_tail, NULL);
 		}
 	}
 	list_del(&cursor);
 
 	/*
+	 * not_cycle_list contains those sockets which do not make up a
+	 * cycle.  Restore these to the inflight list.
+	 */
+	while (!list_empty(&not_cycle_list)) {
+		u = list_entry(not_cycle_list.next, struct unix_sock, link);
+		u->gc_candidate = 0;
+		list_move_tail(&u->link, &gc_inflight_list);
+	}
+
+	/*
 	 * Now gc_candidates contains only garbage.  Restore original
 	 * inflight counters for these as well, and remove the skbuffs
 	 * which are creating the cycle(s).
Index: linux.git/net/unix/af_unix.c
===================================================================
--- linux.git.orig/net/unix/af_unix.c	2008-11-09 14:39:04.000000000 +0100
+++ linux.git/net/unix/af_unix.c	2008-11-09 14:39:11.000000000 +0100
@@ -1302,14 +1302,23 @@  static void unix_destruct_fds(struct sk_
 	sock_wfree(skb);
 }
 
-static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	int i;
+
+	/*
+	 * Need to duplicate file references for the sake of garbage
+	 * collection.  Otherwise a socket in the fps might become a
+	 * candidate for GC while the skb is not yet queued.
+	 */
+	UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+	if (!UNIXCB(skb).fp)
+		return -ENOMEM;
+
 	for (i=scm->fp->count-1; i>=0; i--)
 		unix_inflight(scm->fp->fp[i]);
-	UNIXCB(skb).fp = scm->fp;
 	skb->destructor = unix_destruct_fds;
-	scm->fp = NULL;
+	return 0;
 }
 
 /*
@@ -1368,8 +1377,11 @@  static int unix_dgram_sendmsg(struct kio
 		goto out;
 
 	memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
-	if (siocb->scm->fp)
-		unix_attach_fds(siocb->scm, skb);
+	if (siocb->scm->fp) {
+		err = unix_attach_fds(siocb->scm, skb);
+		if (err)
+			goto out_free;
+	}
 	unix_get_secdata(siocb->scm, skb);
 
 	skb_reset_transport_header(skb);
@@ -1538,8 +1550,13 @@  static int unix_stream_sendmsg(struct ki
 		size = min_t(int, size, skb_tailroom(skb));
 
 		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
-		if (siocb->scm->fp)
-			unix_attach_fds(siocb->scm, skb);
+		if (siocb->scm->fp) {
+			err = unix_attach_fds(siocb->scm, skb);
+			if (err) {
+				kfree_skb(skb);
+				goto out_err;
+			}
+		}
 
 		if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
 			kfree_skb(skb);