
[net] af_unix: don't poll dead peers

Message ID: 1425499629-15223-1-git-send-email-minipli@googlemail.com
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Mathias Krause March 4, 2015, 8:07 p.m. UTC
We need to test if the peer is still alive before actually calling
poll_wait() on its wait queue, as we may race with unix_release_sock().
That race can lead to a use-after-free in the epoll code, ending up in
uninterruptible lockups or other badness.
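
Schematically, the race looks like this (a simplified sketch of the
code paths involved, not verbatim kernel code):

,---[ the race ]---
| /*
|  * task A: epoll waits on socket sk      task B: close()s sk's peer
|  *
|  * unix_dgram_poll(sk)
|  *   other = unix_peer_get(sk);          // reference on peer taken
|  *                                       unix_release_sock(other)
|  *                                         sock_orphan(other);
|  *                                         // SOCK_DEAD is now set
|  *   sock_poll_wait(file,
|  *       &unix_sk(other)->peer_wait,
|  *       wait);                          // epoll entry enqueued on
|  *                                       // the dead peer's wait queue
|  *   sock_put(other);
|  *                                       // once the last reference is
|  *                                       // gone the peer is freed, the
|  *                                       // epoll entry still enqueued
|  *
|  * A later EPOLL_CTL_DEL / ep_remove() then calls remove_wait_queue()
|  * on freed memory -- hence the splats below.
|  */
`-----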

The following program triggers the condition. The visible effects may
take some time to materialize.

,---[ ep.c ]---
| /* gcc -O2 -pthread -o ep ep.c */
| #include <sys/socket.h>
| #include <sys/epoll.h>
| #include <pthread.h>
| #include <unistd.h>
| #include <stdlib.h>
| #include <string.h>
| #include <stdio.h>
| #include <errno.h>
|
| static long fd_max;
| static int ep = -1;
|
| static int get_fd(void) {
|     int fd;
|
|     for (;;) {
|         fd = rand() % fd_max;
|
|         if (fd > 2 && fd != ep)
|             break;
|     }
|
|     return fd;
| }
|
|
| static void *opener(void *ptr) {
|     sleep(1);
|
|     for (;;) {
|         /* reset errno so the EMFILE/ENFILE test below reflects this
|          * iteration's result */
|         errno = 0;
|
|         switch (rand() % 3) {
|             int fd[2];
|
|             case 0:
|                 socketpair(AF_UNIX, SOCK_DGRAM, 0, fd);
|                 break;
|
|             case 1:
|                 dup(get_fd());
|                 break;
|
|             case 2:
|                 write(get_fd(), ".", 1);
|                 break;
|         }
|
|         /* take a short nap when there are no more fds left */
|         if (errno == EMFILE || errno == ENFILE)
|             usleep(10);
|     }
|
|     return ptr;
| }
|
|
| static void *closer(void *ptr) {
|     int miss = 0;
|
|     sleep(1);
|
|     for (;;) {
|         /* reset errno so the EBADF/success tests below reflect this
|          * iteration's result */
|         errno = 0;
|
|         switch (rand() % 5) {
|             default:
|                 close(get_fd());
|                 break;
|
|             case 4:
|                 usleep(1);
|                 break;
|         }
|
|         /* take a short nap when we're not hitting valid fds 10 times in
|          * a row so opener() can catch up */
|         if (errno == EBADF && ++miss >= 10) {
|             usleep(10);
|             miss = 0;
|         } else if (errno == 0) {
|             miss = 0;
|         }
|     }
|
|     return ptr;
| }
|
|
| static void *ep_add(void *ptr) {
|     sleep(1);
|
|     for (;;) {
|         int fd = get_fd();
|         struct epoll_event ev = {
|             .events = EPOLLIN | EPOLLOUT,
|             .data.fd = fd,
|         };
|
|         if (epoll_ctl(ep, EPOLL_CTL_ADD, fd, &ev) < 0 && errno == ENOSPC)
|             usleep(1);
|     }
|
|     return ptr;
| }
|
|
| static void *ep_del(void *ptr) {
|     sleep(1);
|
|     for (;;)
|         epoll_ctl(ep, EPOLL_CTL_DEL, get_fd(), NULL);
|
|     return ptr;
| }
|
|
| int main(void) {
|     pthread_t thread[4];
|     int i;
|
|     /* use epoll_create() to support older kernels */
|     ep = epoll_create(42);
|     if (ep < 0) {
|         fprintf(stderr, "err: epoll_create() failed (%s)\n",
|                 strerror(errno));
|
|         return 1;
|     }
|
|     fd_max = sysconf(_SC_OPEN_MAX);
|     if (pthread_create(&thread[0], NULL, opener, NULL) ||
|         pthread_create(&thread[1], NULL, closer, NULL) ||
|         pthread_create(&thread[2], NULL, ep_add, NULL) ||
|         pthread_create(&thread[3], NULL, ep_del, NULL))
|     {
|         fprintf(stderr, "err: failed to start all threads!\n");
|
|         return 1;
|     }
|
|     /* XXX: pthread_cancel() all threads on termination request */
|
|     for (i = 0; i < 4; i++)
|         pthread_join(thread[i], NULL);
|
|     return 0;
| }
`-----

On a kernel with CONFIG_DEBUG_LIST and CONFIG_SLUB_DEBUG_ON set, the
above program triggers the splats below within seconds:

[   10.660952] ------------[ cut here ]------------
[   10.664035] WARNING: CPU: 0 PID: 113 at /home/minipli/src/linux/lib/list_debug.c:59 __list_del_entry+0xa1/0xd0()
[   10.664035] list_del corruption. prev->next should be ffff88001fb98990, but was ffff88001f43fcd8
[   10.664035] Modules linked in:
[   10.664035] CPU: 0 PID: 113 Comm: ep Not tainted 4.0.0-rc2 #197
[   10.664035] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[   10.664035]  ffffffff814ebce0 ffff88001fb3fd48 ffffffff813ca3be 0000000000000000
[   10.683504]  ffff88001fb3fd98 ffff88001fb3fd88 ffffffff81044b1d ffff880000000000
[   10.683504]  ffff88001fb98990 ffff88001fb98978 0000000000000292 ffff88001f436e00
[   10.683504] Call Trace:
[   10.683504]  [<ffffffff813ca3be>] dump_stack+0x45/0x57
[   10.683504]  [<ffffffff81044b1d>] warn_slowpath_common+0x8d/0xd0
[   10.683504]  [<ffffffff81044c01>] warn_slowpath_fmt+0x41/0x50
[   10.683504]  [<ffffffff811c44e1>] __list_del_entry+0xa1/0xd0
[   10.683504]  [<ffffffff811c4521>] list_del+0x11/0x40
[   10.683504]  [<ffffffff8107279c>] remove_wait_queue+0x2c/0x50
[   10.683504]  [<ffffffff81133bba>] ep_unregister_pollwait.isra.12+0x3a/0x60
[   10.683504]  [<ffffffff81133c0a>] ep_remove+0x2a/0xd0
[   10.683504]  [<ffffffff81134f5e>] SyS_epoll_ctl+0x46e/0xcf0
[   10.683504]  [<ffffffff813cfd89>] ? _raw_spin_unlock_irq+0x9/0x10
[   10.683504]  [<ffffffff813d0192>] system_call_fastpath+0x12/0x17
[   10.683504] ---[ end trace 2a55662f515a4628 ]---
[   10.789959] BUG: spinlock bad magic on CPU#0, ep/113
[   10.792014] general protection fault: 0000 [#1] SMP
[   10.792014] Modules linked in:
[   10.792014] CPU: 0 PID: 113 Comm: ep Tainted: G        W       4.0.0-rc2 #197
[   10.792014] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[   10.792014] task: ffff88001f977500 ti: ffff88001fb3c000 task.ti: ffff88001fb3c000
[   10.792014] RIP: 0010:[<ffffffff813c8682>]  [<ffffffff813c8682>] spin_dump+0x55/0x90
[   10.792014] RSP: 0018:ffff88001fb3fd98  EFLAGS: 00010002
[   10.792014] RAX: 0000000000000028 RBX: 6b6b6b6b6b6b6b6b RCX: 0000000000000000
[   10.792014] RDX: 0000000000000001 RSI: ffff88001f00d1a8 RDI: ffff88001f00d1a8
[   10.792014] RBP: ffff88001fb3fda8 R08: 0000000000000000 R09: 000000006b6b6b6b
[   10.792014] R10: ffffffff81437e80 R11: 0000000000000000 R12: ffff88001fbe99c0
[   10.792014] R13: ffff88001fb48c40 R14: ffff88001fb48c00 R15: fffffffffffffffe
[   10.792014] FS:  00007f62532af700(0000) GS:ffff88001f000000(0000) knlGS:0000000000000000
[   10.792014] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   10.792014] CR2: ffffffffff600400 CR3: 000000001f989000 CR4: 00000000000006f0
[   10.792014] Stack:
[   10.792014]  ffff88001fbe99c0 ffffffff814da8cb ffff88001fb3fdc8 ffffffff813c86de
[   10.792014]  ffff88001fbe99c0 ffff88001fb6eb08 ffff88001fb3fdf8 ffffffff810777f7
[   10.792014]  0000000000000292 ffff88001fb6eb08 ffff88001fb48c40 ffff88001fb48c00
[   10.792014] Call Trace:
[   10.792014]  [<ffffffff813c86de>] spin_bug+0x21/0x26
[   10.792014]  [<ffffffff810777f7>] do_raw_spin_lock+0x157/0x170
[   10.792014]  [<ffffffff813cfdd1>] _raw_spin_lock_irqsave+0x11/0x20
[   10.792014]  [<ffffffff8107278f>] remove_wait_queue+0x1f/0x50
[   10.792014]  [<ffffffff81133bba>] ep_unregister_pollwait.isra.12+0x3a/0x60
[   10.792014]  [<ffffffff81133c0a>] ep_remove+0x2a/0xd0
[   10.792014]  [<ffffffff81134f5e>] SyS_epoll_ctl+0x46e/0xcf0
[   10.792014]  [<ffffffff813cfd89>] ? _raw_spin_unlock_irq+0x9/0x10
[   10.792014]  [<ffffffff813d0192>] system_call_fastpath+0x12/0x17
[   10.792014] Code: 25 00 aa 00 00 44 8b 80 00 03 00 00 48 8d 88 a0 04 00 00 31 c0 65 8b 15 95 1a c4 7e e8 bd 00 00 00 48 85 db 45 8b 4c 24 08 74 10 <44> 8b 83 00 03 00 00 48 8d 8b a0 04 00 00 eb 0b 41 83 c8 ff 48
[   10.792014] RIP  [<ffffffff813c8682>] spin_dump+0x55/0x90
[   10.792014]  RSP <ffff88001fb3fd98>
[   10.792014] ---[ end trace 2a55662f515a462a ]---

Those splats look similar to [1] and [2], so this bug might be the
cause of those reports as well.

[1] https://lkml.org/lkml/2013/10/14/353
[2] https://lkml.org/lkml/2014/5/15/532

The backtraces are misleading, as the bug is, in fact, not in the epoll
code but in af_unix. A 'git bisect' run confirmed that and identified
commit 3c73419c09a5 ("af_unix: fix 'poll for write'/ connected DGRAM
sockets") as the root cause.

This bug has been found by the PaX memory sanitize feature, reported
here: https://forums.grsecurity.net/viewtopic.php?f=3&t=4150

Fixes: 3c73419c09a5 ("af_unix: fix 'poll for write'/ connected DGRAM sockets")
Signed-off-by: Mathias Krause <minipli@googlemail.com>
Reported-by: Olivier Mauras <olivier@mauras.ch>
Cc: Rainer Weikusat <rweikusat@mssgmbh.com>
Cc: PaX Team <pageexec@freemail.hu>
---
 net/unix/af_unix.c |    6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

Comments

Eric Dumazet March 4, 2015, 8:21 p.m. UTC | #1
On Wed, 2015-03-04 at 21:07 +0100, Mathias Krause wrote:
> We need to test if the peer is still alive before actually calling
> poll_wait() on its wait queue, as we may race with unix_release_sock().
> That race can lead to a use-after-free in the epoll code, ending up in
> uninterruptible lockups or other badness.

...

> 
> This bug has been found by the PaX memory sanitize feature, reported
> here: https://forums.grsecurity.net/viewtopic.php?f=3&t=4150
> 
> Fixes: 3c73419c09a5 ("af_unix: fix 'poll for write'/ connected DGRAM sockets")
> Signed-off-by: Mathias Krause <minipli@googlemail.com>
> Reported-by: Olivier Mauras <olivier@mauras.ch>
> Cc: Rainer Weikusat <rweikusat@mssgmbh.com>
> Cc: PaX Team <pageexec@freemail.hu>
> ---
>  net/unix/af_unix.c |    6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 526b6edab018..0da9d7eeed17 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -2233,10 +2233,14 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
>  	writable = unix_writable(sk);
>  	other = unix_peer_get(sk);
>  	if (other) {
> -		if (unix_peer(other) != sk) {
> +		unix_state_lock(other);
> +		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) != sk) {
> +			unix_state_unlock(other);
>  			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
>  			if (unix_recvq_full(other))
>  				writable = 0;
> +		} else {
> +			unix_state_unlock(other);
>  		}
>  		sock_put(other);
>  	}


Isn't using locks in _poll() going to be tremendously expensive for
some applications still using poll() or select()?

This might be fine for a stable candidate, but...

It seems RCU locking might be more appropriate in this fast path.

Please take a look at "struct socket" and its "struct socket_wq __rcu
*wq;"

(commit 43815482370c510c569fd18edb57afcb0fa8cab6 for details)
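
For reference, the wakeup-side pattern that commit introduced looks
roughly like this (reconstructed from memory, details hedged):

	static void sock_def_readable(struct sock *sk, int len)
	{
		struct socket_wq *wq;

		rcu_read_lock();
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
							POLLRDNORM | POLLRDBAND);
		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
		rcu_read_unlock();
	}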


Mathias Krause March 4, 2015, 8:39 p.m. UTC | #2
On 4 March 2015 at 21:21, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2015-03-04 at 21:07 +0100, Mathias Krause wrote:
>> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
>> index 526b6edab018..0da9d7eeed17 100644
>> --- a/net/unix/af_unix.c
>> +++ b/net/unix/af_unix.c
>> @@ -2233,10 +2233,14 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
>>       writable = unix_writable(sk);
>>       other = unix_peer_get(sk);
>>       if (other) {
>> -             if (unix_peer(other) != sk) {
>> +             unix_state_lock(other);
>> +             if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) != sk) {
>> +                     unix_state_unlock(other);
>>                       sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
>>                       if (unix_recvq_full(other))
>>                               writable = 0;
>> +             } else {
>> +                     unix_state_unlock(other);
>>               }
>>               sock_put(other);
>>       }
>
>
> Isn't using locks in _poll() going to be tremendously expensive for
> some applications still using poll() or select()?

I've no idea but we're already taking this very lock in the
unix_peer_get() call a few lines above.
My tests also survived ~12 hours of stress testing without taking the
lock. I'm not at all familiar with the locking semantics here, but
quite a few other locations in af_unix.c take the lock prior to
testing the flag.
I'm happy either way -- with or without the lock.

>
> This might be fine for a stable candidate, but...
>
> It seems RCU locking might be more appropriate in this fast path.
>
> Please take a look at "struct socket" and its "struct socket_wq __rcu
> *wq;"
>
> (commit 43815482370c510c569fd18edb57afcb0fa8cab6 for details)

I guess that would require a more intrusive rewrite of the code which
I'm, unfortunately, unable to do right now. Maybe you are?

Thanks,
Mathias
Eric Dumazet March 4, 2015, 9:02 p.m. UTC | #3
On Wed, 2015-03-04 at 21:39 +0100, Mathias Krause wrote:
> On 4 March 2015 at 21:21, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > Isn't using locks in _poll() going to be tremendously expensive for
> > some applications still using poll() or select()?
> 
> I've no idea but we're already taking this very lock in the
> unix_peer_get() call a few lines above.

This is not the same lock.

other != sk



Mathias Krause March 4, 2015, 9:26 p.m. UTC | #4
[ Removed Rainer from the Cc list, his address is bouncing. ]

On 4 March 2015 at 22:02, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2015-03-04 at 21:39 +0100, Mathias Krause wrote:
>> On 4 March 2015 at 21:21, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> >
>> > Isn't using locks in _poll() going to be tremendously expensive for
>> > some applications still using poll() or select()?
>>
>> I've no idea but we're already taking this very lock in the
>> unix_peer_get() call a few lines above.
>
> This is not the same lock.
>
> other != sk

True. So, do you propose to not take the lock at all?
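
I.e. a lock-free variant along these lines (sketch):

	writable = unix_writable(sk);
	other = unix_peer_get(sk);
	if (other) {
		/* no unix_state_lock(other) -- rely on the reference from
		 * unix_peer_get() and test SOCK_DEAD directly */
		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) != sk) {
			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
			if (unix_recvq_full(other))
				writable = 0;
		}
		sock_put(other);
	}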
Mathias Krause March 25, 2015, 9:33 p.m. UTC | #5
On 4 March 2015 at 22:26, Mathias Krause <minipli@googlemail.com> wrote:
> [ Removed Rainer from the Cc list, his address is bouncing. ]
>
> On 4 March 2015 at 22:02, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Wed, 2015-03-04 at 21:39 +0100, Mathias Krause wrote:
>>> On 4 March 2015 at 21:21, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>> >
>>> > Isn't using locks in _poll() going to be tremendously expensive for
>>> > some applications still using poll() or select()?
>>>
>>> I've no idea but we're already taking this very lock in the
>>> unix_peer_get() call a few lines above.
>>
>> This is not the same lock.
>>
>> other != sk
>
> True. So, do you propose to not take the lock at all?

Ping.

Eric, do you think it's safe to not take the lock prior to testing the flag?

Mathias

Patch

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 526b6edab018..0da9d7eeed17 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2233,10 +2233,14 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	writable = unix_writable(sk);
 	other = unix_peer_get(sk);
 	if (other) {
-		if (unix_peer(other) != sk) {
+		unix_state_lock(other);
+		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) != sk) {
+			unix_state_unlock(other);
 			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
 			if (unix_recvq_full(other))
 				writable = 0;
+		} else {
+			unix_state_unlock(other);
 		}
 		sock_put(other);
 	}