diff mbox

sctp: fix ASCONF list handling

Message ID d78e23c156981e761086f4e37517e441c84d6f90.1432769183.git.marcelo.leitner@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Marcelo Leitner May 28, 2015, 12:52 a.m. UTC
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

->auto_asconf_splist is per namespace and mangled by functions like
sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.

Also, the call to inet_sk_copy_descendant() was backuping
->auto_asconf_list through the copy but was not honoring
->do_auto_asconf, which could lead to list corruption if it was
different between both sockets.

This commit thus fixes the list handling by adding a spinlock to protect
against multiple writers and converts the list to be protected by RCU
too, so that we don't have a lock inverstion issue at
sctp_addr_wq_timeout_handler().

And as this list now uses RCU, we cannot do such backup and restore
while copying descendant data anymore as readers may be traversing the
list meanwhile. We fix this by simply ignoring/not copying those fields,
placed at the end of struct sctp_sock, so we can just ignore it together
with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
so we don't clutter inet_sk_copy_descendant() with SCTP info.

Issue was found with a test application that kept flipping sysctl
default_auto_asconf on and off.

Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 include/net/netns/sctp.h   |  6 +++++-
 include/net/sctp/structs.h |  2 ++
 net/sctp/protocol.c        |  6 +++++-
 net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
 4 files changed, 38 insertions(+), 15 deletions(-)

Comments

Neil Horman May 29, 2015, 1:17 p.m. UTC | #1
On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > 
> > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > 
> > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > different between both sockets.
> > > > > 
> > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > against multiple writers and converts the list to be protected by RCU
> > > > > too, so that we don't have a lock inverstion issue at
> > > > > sctp_addr_wq_timeout_handler().
> > > > > 
> > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > while copying descendant data anymore as readers may be traversing the
> > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > 
> > > > > Issue was found with a test application that kept flipping sysctl
> > > > > default_auto_asconf on and off.
> > > > > 
> > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > ---
> > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > >  include/net/sctp/structs.h |  2 ++
> > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > 
> > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > --- a/include/net/netns/sctp.h
> > > > > +++ b/include/net/netns/sctp.h
> > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > >  	struct list_head local_addr_list;
> > > > >  	struct list_head addr_waitq;
> > > > >  	struct timer_list addr_wq_timer;
> > > > > -	struct list_head auto_asconf_splist;
> > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > already used to protect most accesses to the list you are concerned about.
> > > 
> > > Ok, that works too.
> > > 
> > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > read in one location and only written in one location.  You can likely just
> > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > 
> > > It should, it's not protected by lock_sock as this list resides in
> > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > process context yes, but this list is written in sctp_init_sock(),
> > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > trigger this by either creating/destroying sockets if
> > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > flipping asconf via setsockopt (or a combination of these operations).
> > > (I'll point this out in the changelog)
> > 
> > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > reader is inside that lock too, so I can just protect auto_asconf_splist
> > writers with addr_wq_lock.
> > 
> > Nice, thanks Neil.
> 
> Cannot really do that.. as that creates a lock inversion between
> sctp_destroy_sock() (which already holds lock_sock) and
> sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> locks socket by socket.
> 
> Due to that, I'm afraid reusing this lock is not possible, and we should
> stick with the patch.. what do you think? (though I have to fix the nits
> in there)
> 
I don't think thats accurate.  You are correct in that the the locks are taken
in opposing order, which would imply a lock inversion that could result in
deadlock, but we can avoid that by deferring the asconf list removal until after
sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
the lock ordering consistent.  Alternatively, we can pre-emptively take the
asconf_lock in sctp_close before locking the socket.

I'd really rather avoid creating an additional lock here if we don't have to
Neil

>   Marcelo
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Leitner May 29, 2015, 4:50 p.m. UTC | #2
On Fri, May 29, 2015 at 09:17:26AM -0400, Neil Horman wrote:
> On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> > On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > >
> > > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > >
> > > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > > different between both sockets.
> > > > > >
> > > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > > against multiple writers and converts the list to be protected by RCU
> > > > > > too, so that we don't have a lock inverstion issue at
> > > > > > sctp_addr_wq_timeout_handler().
> > > > > >
> > > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > > while copying descendant data anymore as readers may be traversing the
> > > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > >
> > > > > > Issue was found with a test application that kept flipping sysctl
> > > > > > default_auto_asconf on and off.
> > > > > >
> > > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > ---
> > > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > > >  include/net/sctp/structs.h |  2 ++
> > > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > >
> > > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > > --- a/include/net/netns/sctp.h
> > > > > > +++ b/include/net/netns/sctp.h
> > > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > > >  	struct list_head local_addr_list;
> > > > > >  	struct list_head addr_waitq;
> > > > > >  	struct timer_list addr_wq_timer;
> > > > > > -	struct list_head auto_asconf_splist;
> > > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > > already used to protect most accesses to the list you are concerned about.
> > > >
> > > > Ok, that works too.
> > > >
> > > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > > read in one location and only written in one location.  You can likely just
> > > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > >
> > > > It should, it's not protected by lock_sock as this list resides in
> > > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > > process context yes, but this list is written in sctp_init_sock(),
> > > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > > trigger this by either creating/destroying sockets if
> > > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > > flipping asconf via setsockopt (or a combination of these operations).
> > > > (I'll point this out in the changelog)
> > >
> > > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > > reader is inside that lock too, so I can just protect auto_asconf_splist
> > > writers with addr_wq_lock.
> > >
> > > Nice, thanks Neil.
> >
> > Cannot really do that.. as that creates a lock inversion between
> > sctp_destroy_sock() (which already holds lock_sock) and
> > sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> > locks socket by socket.
> >
> > Due to that, I'm afraid reusing this lock is not possible, and we should
> > stick with the patch.. what do you think? (though I have to fix the nits
> > in there)
> >
> I don't think thats accurate.  You are correct in that the the locks are taken
> in opposing order, which would imply a lock inversion that could result in
> deadlock, but we can avoid that by deferring the asconf list removal until after
> sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
> the lock ordering consistent.  Alternatively, we can pre-emptively take the
> asconf_lock in sctp_close before locking the socket.

For your first approach, deferring the asconf list removal, we can only
do that reliably via some work queue, because we initialize asconf stuff
on sctp_init_sock() and it should be de-initialized on its counterpart,
sctp_destroy_sock(), as we have code like:

(same for ipv4)
sctp_v6_create_accept_sk()
{
...
        if (newsk->sk_prot->init(newsk)) {
                sk_common_release(newsk);
                newsk = NULL;
        }
...
}

and at inet6_create() too:
        if (sk->sk_prot->init) {
                err = sk->sk_prot->init(sk);
                if (err)
                        sk_common_release(sk);
        }

Or we (kind of) abuse of knowing that sctp_init_sock() cannot fail after
initializing asconf and move asconf stuff from sctp_destroy_sock() to
sctp_close(). AFAICT it could be enough, I'm just not a big fan of not
having that similarity.

If we try to lock addr_wq_lock early in sctp_close(),
sctp_destroy_sock() would be unprotected on above situations, but if we
know that sctp_init_sock() won't fail after initilizating asconf, it
wouldn't be a problem...

> I'd really rather avoid creating an additional lock here if we don't have to

I understand. I'm just not seeing another way out so far... I'll keep
trying, but please I'm all ears to ideas ;)

  Marcelo

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman June 1, 2015, 2 p.m. UTC | #3
On Fri, May 29, 2015 at 01:50:37PM -0300, Marcelo Ricardo Leitner wrote:
> On Fri, May 29, 2015 at 09:17:26AM -0400, Neil Horman wrote:
> > On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> > > On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > > > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > >
> > > > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > > >
> > > > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > > > different between both sockets.
> > > > > > >
> > > > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > > > against multiple writers and converts the list to be protected by RCU
> > > > > > > too, so that we don't have a lock inverstion issue at
> > > > > > > sctp_addr_wq_timeout_handler().
> > > > > > >
> > > > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > > > while copying descendant data anymore as readers may be traversing the
> > > > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > > >
> > > > > > > Issue was found with a test application that kept flipping sysctl
> > > > > > > default_auto_asconf on and off.
> > > > > > >
> > > > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > ---
> > > > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > > > >  include/net/sctp/structs.h |  2 ++
> > > > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > > >
> > > > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > > > --- a/include/net/netns/sctp.h
> > > > > > > +++ b/include/net/netns/sctp.h
> > > > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > > > >  	struct list_head local_addr_list;
> > > > > > >  	struct list_head addr_waitq;
> > > > > > >  	struct timer_list addr_wq_timer;
> > > > > > > -	struct list_head auto_asconf_splist;
> > > > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > > > already used to protect most accesses to the list you are concerned about.
> > > > >
> > > > > Ok, that works too.
> > > > >
> > > > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > > > read in one location and only written in one location.  You can likely just
> > > > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > > >
> > > > > It should, it's not protected by lock_sock as this list resides in
> > > > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > > > process context yes, but this list is written in sctp_init_sock(),
> > > > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > > > trigger this by either creating/destroying sockets if
> > > > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > > > flipping asconf via setsockopt (or a combination of these operations).
> > > > > (I'll point this out in the changelog)
> > > >
> > > > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > > > reader is inside that lock too, so I can just protect auto_asconf_splist
> > > > writers with addr_wq_lock.
> > > >
> > > > Nice, thanks Neil.
> > >
> > > Cannot really do that.. as that creates a lock inversion between
> > > sctp_destroy_sock() (which already holds lock_sock) and
> > > sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> > > locks socket by socket.
> > >
> > > Due to that, I'm afraid reusing this lock is not possible, and we should
> > > stick with the patch.. what do you think? (though I have to fix the nits
> > > in there)
> > >
> > I don't think thats accurate.  You are correct in that the the locks are taken
> > in opposing order, which would imply a lock inversion that could result in
> > deadlock, but we can avoid that by deferring the asconf list removal until after
> > sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
> > the lock ordering consistent.  Alternatively, we can pre-emptively take the
> > asconf_lock in sctp_close before locking the socket.
> 
> For your first approach, deferring the asconf list removal, we can only
> do that reliably via some work queue, because we initialize asconf stuff
> on sctp_init_sock() and it should be de-initialized on its counterpart,
> sctp_destroy_sock(), as we have code like:
> 
> (same for ipv4)
> sctp_v6_create_accept_sk()
> {
> ...
>         if (newsk->sk_prot->init(newsk)) {
>                 sk_common_release(newsk);
>                 newsk = NULL;
>         }
> ...
> }
> 
> and at inet6_create() too:
>         if (sk->sk_prot->init) {
>                 err = sk->sk_prot->init(sk);
>                 if (err)
>                         sk_common_release(sk);
>         }
> 
> Or we (kind of) abuse of knowing that sctp_init_sock() cannot fail after
> initializing asconf and move asconf stuff from sctp_destroy_sock() to
> sctp_close(). AFAICT it could be enough, I'm just not a big fan of not
> having that similarity.
> 
> If we try to lock addr_wq_lock early in sctp_close(),
> sctp_destroy_sock() would be unprotected on above situations, but if we
> know that sctp_init_sock() won't fail after initilizating asconf, it
> wouldn't be a problem...
> 
> > I'd really rather avoid creating an additional lock here if we don't have to
> 
> I understand. I'm just not seeing another way out so far... I'll keep
> trying, but please I'm all ears to ideas ;)
> 
>   Marcelo
rcu-ify auto_asconf_splist, then just use the addr_wq_lock to protect the list
modification sites, as those are all handled at locations that already hold a
socket lock.  sctp_addr_wq_timeout_handler is a read-only function for
auto_asconf_splist, and so the spin_lock_bh there can be replaced with an
rcu_read_lock_bh call, breaking the lock inversion
Neil

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Leitner June 1, 2015, 10:28 p.m. UTC | #4
On Mon, Jun 01, 2015 at 10:00:28AM -0400, Neil Horman wrote:
> On Fri, May 29, 2015 at 01:50:37PM -0300, Marcelo Ricardo Leitner wrote:
> > On Fri, May 29, 2015 at 09:17:26AM -0400, Neil Horman wrote:
> > > On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> > > > On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > >
> > > > > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > > > >
> > > > > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > > > > different between both sockets.
> > > > > > > >
> > > > > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > > > > against multiple writers and converts the list to be protected by RCU
> > > > > > > > too, so that we don't have a lock inverstion issue at
> > > > > > > > sctp_addr_wq_timeout_handler().
> > > > > > > >
> > > > > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > > > > while copying descendant data anymore as readers may be traversing the
> > > > > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > > > >
> > > > > > > > Issue was found with a test application that kept flipping sysctl
> > > > > > > > default_auto_asconf on and off.
> > > > > > > >
> > > > > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > > ---
> > > > > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > > > > >  include/net/sctp/structs.h |  2 ++
> > > > > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > > > > --- a/include/net/netns/sctp.h
> > > > > > > > +++ b/include/net/netns/sctp.h
> > > > > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > > > > >  	struct list_head local_addr_list;
> > > > > > > >  	struct list_head addr_waitq;
> > > > > > > >  	struct timer_list addr_wq_timer;
> > > > > > > > -	struct list_head auto_asconf_splist;
> > > > > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > > > > already used to protect most accesses to the list you are concerned about.
> > > > > >
> > > > > > Ok, that works too.
> > > > > >
> > > > > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > > > > read in one location and only written in one location.  You can likely just
> > > > > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > > > >
> > > > > > It should, it's not protected by lock_sock as this list resides in
> > > > > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > > > > process context yes, but this list is written in sctp_init_sock(),
> > > > > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > > > > trigger this by either creating/destroying sockets if
> > > > > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > > > > flipping asconf via setsockopt (or a combination of these operations).
> > > > > > (I'll point this out in the changelog)
> > > > >
> > > > > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > > > > reader is inside that lock too, so I can just protect auto_asconf_splist
> > > > > writers with addr_wq_lock.
> > > > >
> > > > > Nice, thanks Neil.
> > > >
> > > > Cannot really do that.. as that creates a lock inversion between
> > > > sctp_destroy_sock() (which already holds lock_sock) and
> > > > sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> > > > locks socket by socket.
> > > >
> > > > Due to that, I'm afraid reusing this lock is not possible, and we should
> > > > stick with the patch.. what do you think? (though I have to fix the nits
> > > > in there)
> > > >
> > > I don't think thats accurate.  You are correct in that the the locks are taken
> > > in opposing order, which would imply a lock inversion that could result in
> > > deadlock, but we can avoid that by deferring the asconf list removal until after
> > > sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
> > > the lock ordering consistent.  Alternatively, we can pre-emptively take the
> > > asconf_lock in sctp_close before locking the socket.
> > 
> > For your first approach, deferring the asconf list removal, we can only
> > do that reliably via some work queue, because we initialize asconf stuff
> > on sctp_init_sock() and it should be de-initialized on its counterpart,
> > sctp_destroy_sock(), as we have code like:
> > 
> > (same for ipv4)
> > sctp_v6_create_accept_sk()
> > {
> > ...
> >         if (newsk->sk_prot->init(newsk)) {
> >                 sk_common_release(newsk);
> >                 newsk = NULL;
> >         }
> > ...
> > }
> > 
> > and at inet6_create() too:
> >         if (sk->sk_prot->init) {
> >                 err = sk->sk_prot->init(sk);
> >                 if (err)
> >                         sk_common_release(sk);
> >         }
> > 
> > Or we (kind of) abuse of knowing that sctp_init_sock() cannot fail after
> > initializing asconf and move asconf stuff from sctp_destroy_sock() to
> > sctp_close(). AFAICT it could be enough, I'm just not a big fan of not
> > having that similarity.
> > 
> > If we try to lock addr_wq_lock early in sctp_close(),
> > sctp_destroy_sock() would be unprotected on above situations, but if we
> > know that sctp_init_sock() won't fail after initilizating asconf, it
> > wouldn't be a problem...
> > 
> > > I'd really rather avoid creating an additional lock here if we don't have to
> > 
> > I understand. I'm just not seeing another way out so far... I'll keep
> > trying, but please I'm all ears to ideas ;)
> > 
> >   Marcelo
> rcu-ify auto_asconf_splist, then just use the addr_wq_lock to protect the list
> modification sites, as those are all handled at locations that already hold a
> socket lock.  sctp_addr_wq_timeout_handler is a read-only function for
> auto_asconf_splist, and so the spin_lock_bh there can be replaced with an
> rcu_read_lock_bh call, breaking the lock inversion
> Neil

That's still the lock inversion, because at
sctp_addr_wq_timeout_handler() we are not grabbing addr_wq_lock just for
this patch, it's already there and later inside that for_each it will
lock socket by socket; while on sctp_destroy_sock() , it is called with
socket lock already held and we would be trying to grab ->addr_wq_lock
in order to protect the list change.

We could rcu-ify addr_waitq too. Let sctp_addr_wq_timeout_handler() work
on the list protected by rcu_read_lock_bh() and only at the bottom of
that for() we grab addr_wq_lock and do the list change. Would require
more changes than just that in order to work properly (and avoid
double-frees through that timer and sctp_free_addr_wq() for example)

  Marcelo

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman June 2, 2015, 1:48 p.m. UTC | #5
On Mon, Jun 01, 2015 at 07:28:21PM -0300, Marcelo Ricardo Leitner wrote:
> On Mon, Jun 01, 2015 at 10:00:28AM -0400, Neil Horman wrote:
> > On Fri, May 29, 2015 at 01:50:37PM -0300, Marcelo Ricardo Leitner wrote:
> > > On Fri, May 29, 2015 at 09:17:26AM -0400, Neil Horman wrote:
> > > > On Thu, May 28, 2015 at 11:46:29AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > On Thu, May 28, 2015 at 10:27:32AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > > On Thu, May 28, 2015 at 08:17:27AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > > > On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> > > > > > > > On Wed, May 27, 2015 at 09:52:17PM -0300, mleitner@redhat.com wrote:
> > > > > > > > > From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > > >
> > > > > > > > > ->auto_asconf_splist is per namespace and mangled by functions like
> > > > > > > > > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > > > > > > > >
> > > > > > > > > Also, the call to inet_sk_copy_descendant() was backuping
> > > > > > > > > ->auto_asconf_list through the copy but was not honoring
> > > > > > > > > ->do_auto_asconf, which could lead to list corruption if it was
> > > > > > > > > different between both sockets.
> > > > > > > > >
> > > > > > > > > This commit thus fixes the list handling by adding a spinlock to protect
> > > > > > > > > against multiple writers and converts the list to be protected by RCU
> > > > > > > > > too, so that we don't have a lock inverstion issue at
> > > > > > > > > sctp_addr_wq_timeout_handler().
> > > > > > > > >
> > > > > > > > > And as this list now uses RCU, we cannot do such backup and restore
> > > > > > > > > while copying descendant data anymore as readers may be traversing the
> > > > > > > > > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > > > > > > > > placed at the end of struct sctp_sock, so we can just ignore it together
> > > > > > > > > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > > > > > > > > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > > > > > > > >
> > > > > > > > > Issue was found with a test application that kept flipping sysctl
> > > > > > > > > default_auto_asconf on and off.
> > > > > > > > >
> > > > > > > > > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > > > > > > > > Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > > > > > > > ---
> > > > > > > > >  include/net/netns/sctp.h   |  6 +++++-
> > > > > > > > >  include/net/sctp/structs.h |  2 ++
> > > > > > > > >  net/sctp/protocol.c        |  6 +++++-
> > > > > > > > >  net/sctp/socket.c          | 39 ++++++++++++++++++++++++++-------------
> > > > > > > > >  4 files changed, 38 insertions(+), 15 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > > > > > > > > index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
> > > > > > > > > --- a/include/net/netns/sctp.h
> > > > > > > > > +++ b/include/net/netns/sctp.h
> > > > > > > > > @@ -30,12 +30,15 @@ struct netns_sctp {
> > > > > > > > >  	struct list_head local_addr_list;
> > > > > > > > >  	struct list_head addr_waitq;
> > > > > > > > >  	struct timer_list addr_wq_timer;
> > > > > > > > > -	struct list_head auto_asconf_splist;
> > > > > > > > > +	struct list_head __rcu auto_asconf_splist;
> > > > > > > > You should use the addr_wq_lock here instead of creating a new lock, as thats
> > > > > > > > already used to protect most accesses to the list you are concerned about.
> > > > > > >
> > > > > > > Ok, that works too.
> > > > > > >
> > > > > > > > Though truthfully, that shouldn't be necessecary.  The list in question is only
> > > > > > > > read in one location and only written in one location.  You can likely just
> > > > > > > > rcu-ify, as the write side is in process context and protected by lock_sock.
> > > > > > >
> > > > > > > It should, it's not protected by lock_sock as this list resides in
> > > > > > > netns_sctp structure, which lock_sock doesn't cover. Write side is in
> > > > > > > process context yes, but this list is written in sctp_init_sock(),
> > > > > > > sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
> > > > > > > trigger this by either creating/destroying sockets if
> > > > > > > default_auto_asconf=1 or just by creating a bunch of sockets and
> > > > > > > flipping asconf via setsockopt (or a combination of these operations).
> > > > > > > (I'll point this out in the changelog)
> > > > > >
> > > > > > Hmm.. by reusing addr_wq_lock we don't need to rcu-ify the list, as the
> > > > > > reader is inside that lock too, so I can just protect auto_asconf_splist
> > > > > > writers with addr_wq_lock.
> > > > > >
> > > > > > Nice, thanks Neil.
> > > > >
> > > > > Cannot really do that.. as that creates a lock inversion between
> > > > > sctp_destroy_sock() (which already holds lock_sock) and
> > > > > sctp_addr_wq_timeout_handler(), which first grabs addr_wq_lock and then
> > > > > locks socket by socket.
> > > > >
> > > > > Due to that, I'm afraid reusing this lock is not possible, and we should
> > > > > stick with the patch.. what do you think? (though I have to fix the nits
> > > > > in there)
> > > > >
> > > > I don't think thats accurate.  You are correct in that the the locks are taken
> > > > in opposing order, which would imply a lock inversion that could result in
> > > > deadlock, but we can avoid that by deferring the asconf list removal until after
> > > > sk_common_release and unlock_sock_bh is called in sctp_close.  That will make
> > > > the lock ordering consistent.  Alternatively, we can pre-emptively take the
> > > > asconf_lock in sctp_close before locking the socket.
> > > 
> > > For your first approach, deferring the asconf list removal, we can only
> > > do that reliably via some work queue, because we initialize asconf stuff
> > > on sctp_init_sock() and it should be de-initialized on its counterpart,
> > > sctp_destroy_sock(), as we have code like:
> > > 
> > > (same for ipv4)
> > > sctp_v6_create_accept_sk()
> > > {
> > > ...
> > >         if (newsk->sk_prot->init(newsk)) {
> > >                 sk_common_release(newsk);
> > >                 newsk = NULL;
> > >         }
> > > ...
> > > }
> > > 
> > > and at inet6_create() too:
> > >         if (sk->sk_prot->init) {
> > >                 err = sk->sk_prot->init(sk);
> > >                 if (err)
> > >                         sk_common_release(sk);
> > >         }
> > > 
> > > Or we (kind of) abuse of knowing that sctp_init_sock() cannot fail after
> > > initializing asconf and move asconf stuff from sctp_destroy_sock() to
> > > sctp_close(). AFAICT it could be enough, I'm just not a big fan of not
> > > having that similarity.
> > > 
> > > If we try to lock addr_wq_lock early in sctp_close(),
> > > sctp_destroy_sock() would be unprotected on above situations, but if we
> > > know that sctp_init_sock() won't fail after initilizating asconf, it
> > > wouldn't be a problem...
> > > 
> > > > I'd really rather avoid creating an additional lock here if we don't have to
> > > 
> > > I understand. I'm just not seeing another way out so far... I'll keep
> > > trying, but please I'm all ears to ideas ;)
> > > 
> > >   Marcelo
> > rcu-ify auto_asconf_splist, then just use the addr_wq_lock to protect the list
> > modification sites, as those are all handled at locations that already hold a
> > socket lock.  sctp_addr_wq_timeout_handler is a read-only function for
> > auto_asconf_splist, and so the spin_lock_bh there can be replaced with an
> > rcu_read_lock_bh call, breaking the lock inversion
> > Neil
> 
> That's still the lock inversion, because at
> sctp_addr_wq_timeout_handler() we are not grabbing addr_wq_lock just for
> this patch, it's already there and later inside that for_each it will
> lock socket by socket; while on sctp_destroy_sock() , it is called with
> socket lock already held and we would be trying to grab ->addr_wq_lock
> in order to protect the list change.
> 
I get this, thats why I suggested rcu-ifying the list, because those loops are
read-only sites, you can replace the spinlock with an rcu lock.

> We could rcu-ify addr_waitq too. Let sctp_addr_wq_timeout_handler() work
> on the list protected by rcu_read_lock_bh() and only at the bottom of
> that for() we grab addr_wq_lock and do the list change. Would require
> more changes than just that in order to work properly (and avoid
> double-frees through that timer and sctp_free_addr_wq() for example)
> 
Ah, sorry, yes I missed the list_del at the bottom.  You're correct though, if
you rcu-ify both lists, you can just lock the addr_wq_lock there in the proper
order and do the list modification under protection.  Double free protection
shouldn't be hard there, just add another list iterator that searches for an
element who's pointer is the same as the element you want to remove.  Only one
context should ever find it if its being removed.

Regards
Neil

>   Marcelo
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -30,12 +30,15 @@  struct netns_sctp {
 	struct list_head local_addr_list;
 	struct list_head addr_waitq;
 	struct timer_list addr_wq_timer;
-	struct list_head auto_asconf_splist;
+	struct list_head __rcu auto_asconf_splist;
 	spinlock_t addr_wq_lock;
 
 	/* Lock that protects the local_addr_list writers */
 	spinlock_t local_addr_lock;
 
+	/* Lock that protects the auto_asconf_splist writers */
+	spinlock_t auto_asconf_lock;
+
 	/* RFC2960 Section 14. Suggested SCTP Protocol Parameter Values
 	 *
 	 * The following protocol parameters are RECOMMENDED:
@@ -129,6 +132,7 @@  struct netns_sctp {
 
 	/* Threshold for autoclose timeout, in seconds. */
 	unsigned long max_autoclose;
+
 };
 
 #endif /* __NETNS_SCTP_H__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2bb2fcf5b11f0387c81b860ad2d3a6607da19a7d..459d9b8193cb9193ee038a09b67d2b67fbf335f0 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -223,6 +223,8 @@  struct sctp_sock {
 	atomic_t pd_mode;
 	/* Receive to here while partial delivery is in effect. */
 	struct sk_buff_head pd_lobby;
+	/* These must be the last fields, as they will skipped on copies,
+	 * like on accept and peeloff operations */
 	struct list_head auto_asconf_list;
 	int do_auto_asconf;
 };
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 53b7acde9aa37bf3d4029c459421564d5270f4c0..6947feae5b27ffbc61666f27fda384d346b982ba 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -632,7 +632,9 @@  static void sctp_addr_wq_timeout_handler(unsigned long arg)
 			}
 		}
 #endif
-		list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) {
+		rcu_read_lock();
+		list_for_each_entry_rcu(sp, &net->sctp.auto_asconf_splist,
+					auto_asconf_list) {
 			struct sock *sk;
 
 			sk = sctp_opt2sk(sp);
@@ -644,6 +646,7 @@  static void sctp_addr_wq_timeout_handler(unsigned long arg)
 				pr_debug("%s: sctp_asconf_mgmt failed\n", __func__);
 			bh_unlock_sock(sk);
 		}
+		rcu_read_unlock();
 #if IS_ENABLED(CONFIG_IPV6)
 free_next:
 #endif
@@ -1268,6 +1271,7 @@  static int __net_init sctp_net_init(struct net *net)
 	/* Initialize the local address list. */
 	INIT_LIST_HEAD(&net->sctp.local_addr_list);
 	spin_lock_init(&net->sctp.local_addr_lock);
+	spin_lock_init(&net->sctp.auto_asconf_lock);
 	sctp_get_local_addr_list(net);
 
 	/* Initialize the address event list */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f09de7fac2e6acddad8b2e046dbf626e329cb674..acc0fb0deed0e801bbb8460af0a965aedfdd0348 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3580,14 +3580,16 @@  static int sctp_setsockopt_auto_asconf(struct sock *sk, char __user *optval,
 	if ((val && sp->do_auto_asconf) || (!val && !sp->do_auto_asconf))
 		return 0;
 
+	spin_lock(&sock_net(sk)->sctp.auto_asconf_lock);
 	if (val == 0 && sp->do_auto_asconf) {
-		list_del(&sp->auto_asconf_list);
 		sp->do_auto_asconf = 0;
+		list_del_rcu(&sp->auto_asconf_list);
 	} else if (val && !sp->do_auto_asconf) {
-		list_add_tail(&sp->auto_asconf_list,
-		    &sock_net(sk)->sctp.auto_asconf_splist);
 		sp->do_auto_asconf = 1;
+		list_add_tail_rcu(&sp->auto_asconf_list,
+				  &sock_net(sk)->sctp.auto_asconf_splist);
 	}
+	spin_unlock(&sock_net(sk)->sctp.auto_asconf_lock);
 	return 0;
 }
 
@@ -4122,9 +4124,11 @@  static int sctp_init_sock(struct sock *sk)
 	percpu_counter_inc(&sctp_sockets_allocated);
 	sock_prot_inuse_add(net, sk->sk_prot, 1);
 	if (net->sctp.default_auto_asconf) {
-		list_add_tail(&sp->auto_asconf_list,
-		    &net->sctp.auto_asconf_splist);
+		spin_lock(&net->sctp.auto_asconf_lock);
 		sp->do_auto_asconf = 1;
+		list_add_tail_rcu(&sp->auto_asconf_list,
+				  &net->sctp.auto_asconf_splist);
+		spin_unlock(&net->sctp.auto_asconf_lock);
 	} else
 		sp->do_auto_asconf = 0;
 	local_bh_enable();
@@ -4148,8 +4152,10 @@  static void sctp_destroy_sock(struct sock *sk)
 		return;
 
 	if (sp->do_auto_asconf) {
+		spin_lock(&sock_net(sk)->sctp.auto_asconf_lock);
 		sp->do_auto_asconf = 0;
-		list_del(&sp->auto_asconf_list);
+		list_del_rcu(&sp->auto_asconf_list);
+		spin_unlock(&sock_net(sk)->sctp.auto_asconf_lock);
 	}
 	sctp_endpoint_free(sp->ep);
 	local_bh_disable();
@@ -7195,6 +7201,19 @@  void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newinet->mc_list = NULL;
 }
 
+static inline void sctp_copy_descendant(struct sock *sk_to,
+					const struct sock *sk_from)
+{
+	int ancestor_size = sizeof(struct inet_sock) +
+			    sizeof(struct sctp_sock) -
+			    offsetof(struct sctp_sock, auto_asconf_list);
+
+	if (sk_from->sk_family == PF_INET6)
+		ancestor_size += sizeof(struct ipv6_pinfo);
+
+	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
+}
+
 /* Populate the fields of the newsk from the oldsk and migrate the assoc
  * and its messages to the newsk.
  */
@@ -7209,7 +7228,6 @@  static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sk_buff *skb, *tmp;
 	struct sctp_ulpevent *event;
 	struct sctp_bind_hashbucket *head;
-	struct list_head tmplist;
 
 	/* Migrate socket buffer sizes and all the socket level options to the
 	 * new socket.
@@ -7217,12 +7235,7 @@  static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	newsk->sk_sndbuf = oldsk->sk_sndbuf;
 	newsk->sk_rcvbuf = oldsk->sk_rcvbuf;
 	/* Brute force copy old sctp opt. */
-	if (oldsp->do_auto_asconf) {
-		memcpy(&tmplist, &newsp->auto_asconf_list, sizeof(tmplist));
-		inet_sk_copy_descendant(newsk, oldsk);
-		memcpy(&newsp->auto_asconf_list, &tmplist, sizeof(tmplist));
-	} else
-		inet_sk_copy_descendant(newsk, oldsk);
+	sctp_copy_descendant(newsk, oldsk);
 
 	/* Restore the ep value that was overwritten with the above structure
 	 * copy.