diff mbox series

[bpf,v3] xdp: fix hang while unregistering device bound to xdp socket

Message ID 20190610161546.30569-1-i.maximets@samsung.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series [bpf,v3] xdp: fix hang while unregistering device bound to xdp socket | expand

Commit Message

Ilya Maximets June 10, 2019, 4:15 p.m. UTC
Device that bound to XDP socket will not have zero refcount until the
userspace application will not close it. This leads to hang inside
'netdev_wait_allrefs()' if device unregistering requested:

  # ip link del p1
  < hang on recvmsg on netlink socket >

  # ps -x | grep ip
  5126  pts/0    D+   0:00 ip link del p1

  # journalctl -b

  Jun 05 07:19:16 kernel:
  unregister_netdevice: waiting for p1 to become free. Usage count = 1

  Jun 05 07:19:27 kernel:
  unregister_netdevice: waiting for p1 to become free. Usage count = 1
  ...

Fix that by implementing NETDEV_UNREGISTER event notification handler
to properly clean up all the resources and unref device.

This should also allow socket killing via ss(8) utility.

Fixes: 965a99098443 ("xsk: add support for bind for Rx")
Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
---

Version 3:

    * Declaration lines ordered from longest to shortest.
    * Checking of event type moved to the top to avoid unnecessary
      locking.

Version 2:

    * Completely re-implemented using netdev event handler.

 net/xdp/xsk.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

Comments

Andrii Nakryiko June 10, 2019, 7 p.m. UTC | #1
On Mon, Jun 10, 2019 at 9:39 AM Ilya Maximets <i.maximets@samsung.com> wrote:
>
> Device that bound to XDP socket will not have zero refcount until the
> userspace application will not close it. This leads to hang inside
> 'netdev_wait_allrefs()' if device unregistering requested:
>
>   # ip link del p1
>   < hang on recvmsg on netlink socket >
>
>   # ps -x | grep ip
>   5126  pts/0    D+   0:00 ip link del p1
>
>   # journalctl -b
>
>   Jun 05 07:19:16 kernel:
>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>
>   Jun 05 07:19:27 kernel:
>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>   ...
>
> Fix that by implementing NETDEV_UNREGISTER event notification handler
> to properly clean up all the resources and unref device.
>
> This should also allow socket killing via ss(8) utility.
>
> Fixes: 965a99098443 ("xsk: add support for bind for Rx")
> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> ---
>
> Version 3:
>
>     * Declaration lines ordered from longest to shortest.
>     * Checking of event type moved to the top to avoid unnecessary
>       locking.
>
> Version 2:
>
>     * Completely re-implemented using netdev event handler.
>
>  net/xdp/xsk.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 64 insertions(+), 1 deletion(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index a14e8864e4fa..273a419a8c4d 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -693,6 +693,57 @@ static int xsk_mmap(struct file *file, struct socket *sock,
>                                size, vma->vm_page_prot);
>  }
>
> +static int xsk_notifier(struct notifier_block *this,
> +                       unsigned long msg, void *ptr)
> +{
> +       struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +       struct net *net = dev_net(dev);
> +       int i, unregister_count = 0;
> +       struct sock *sk;
> +
> +       switch (msg) {
> +       case NETDEV_UNREGISTER:
> +               mutex_lock(&net->xdp.lock);
> +               sk_for_each(sk, &net->xdp.list) {
> +                       struct xdp_sock *xs = xdp_sk(sk);
> +
> +                       mutex_lock(&xs->mutex);
> +                       if (dev != xs->dev) {
> +                               mutex_unlock(&xs->mutex);
> +                               continue;
> +                       }
> +
> +                       sk->sk_err = ENETDOWN;
> +                       if (!sock_flag(sk, SOCK_DEAD))
> +                               sk->sk_error_report(sk);
> +
> +                       /* Wait for driver to stop using the xdp socket. */
> +                       xdp_del_sk_umem(xs->umem, xs);
> +                       xs->dev = NULL;
> +                       synchronize_net();
> +
> +                       /* Clear device references in umem. */
> +                       xdp_put_umem(xs->umem);
> +                       xs->umem = NULL;
> +
> +                       mutex_unlock(&xs->mutex);
> +                       unregister_count++;
> +               }
> +               mutex_unlock(&net->xdp.lock);
> +
> +               if (unregister_count) {
> +                       /* Wait for umem clearing completion. */
> +                       synchronize_net();
> +                       for (i = 0; i < unregister_count; i++)
> +                               dev_put(dev);
> +               }
> +
> +               break;
> +       }
> +
> +       return NOTIFY_DONE;
> +}
> +
>  static struct proto xsk_proto = {
>         .name =         "XDP",
>         .owner =        THIS_MODULE,
> @@ -727,7 +778,8 @@ static void xsk_destruct(struct sock *sk)
>         if (!sock_flag(sk, SOCK_DEAD))
>                 return;
>
> -       xdp_put_umem(xs->umem);
> +       if (xs->umem)
> +               xdp_put_umem(xs->umem);

xpd_put_umem already checks for NULL umem, so you don't have to do it here.

>
>         sk_refcnt_debug_dec(sk);
>  }
> @@ -784,6 +836,10 @@ static const struct net_proto_family xsk_family_ops = {
>         .owner  = THIS_MODULE,
>  };
>
> +static struct notifier_block xsk_netdev_notifier = {
> +       .notifier_call  = xsk_notifier,
> +};
> +
>  static int __net_init xsk_net_init(struct net *net)
>  {
>         mutex_init(&net->xdp.lock);
> @@ -816,8 +872,15 @@ static int __init xsk_init(void)
>         err = register_pernet_subsys(&xsk_net_ops);
>         if (err)
>                 goto out_sk;
> +
> +       err = register_netdevice_notifier(&xsk_netdev_notifier);
> +       if (err)
> +               goto out_pernet;
> +
>         return 0;
>
> +out_pernet:
> +       unregister_pernet_subsys(&xsk_net_ops);
>  out_sk:
>         sock_unregister(PF_XDP);
>  out_proto:
> --
> 2.17.1
>
William Tu June 10, 2019, 7:45 p.m. UTC | #2
On Mon, Jun 10, 2019 at 9:39 AM Ilya Maximets <i.maximets@samsung.com> wrote:
>
> Device that bound to XDP socket will not have zero refcount until the
> userspace application will not close it. This leads to hang inside
> 'netdev_wait_allrefs()' if device unregistering requested:
>
>   # ip link del p1
>   < hang on recvmsg on netlink socket >
>
>   # ps -x | grep ip
>   5126  pts/0    D+   0:00 ip link del p1
>
>   # journalctl -b
>
>   Jun 05 07:19:16 kernel:
>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>
>   Jun 05 07:19:27 kernel:
>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>   ...

Thanks. I hit the same issue quite often when using veth driver to
test AF_XDP.

>
> Fix that by implementing NETDEV_UNREGISTER event notification handler
> to properly clean up all the resources and unref device.
>
> This should also allow socket killing via ss(8) utility.
>
> Fixes: 965a99098443 ("xsk: add support for bind for Rx")
> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> ---
Tested-by: William Tu <u9012063@gmail.com>
Jonathan Lemon June 10, 2019, 8:47 p.m. UTC | #3
On 10 Jun 2019, at 9:15, Ilya Maximets wrote:

> Device that bound to XDP socket will not have zero refcount until the
> userspace application will not close it. This leads to hang inside
> 'netdev_wait_allrefs()' if device unregistering requested:
>
>   # ip link del p1
>   < hang on recvmsg on netlink socket >
>
>   # ps -x | grep ip
>   5126  pts/0    D+   0:00 ip link del p1
>
>   # journalctl -b
>
>   Jun 05 07:19:16 kernel:
>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>
>   Jun 05 07:19:27 kernel:
>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>   ...
>
> Fix that by implementing NETDEV_UNREGISTER event notification handler
> to properly clean up all the resources and unref device.
>
> This should also allow socket killing via ss(8) utility.
>
> Fixes: 965a99098443 ("xsk: add support for bind for Rx")
> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> ---
>
> Version 3:
>
>     * Declaration lines ordered from longest to shortest.
>     * Checking of event type moved to the top to avoid unnecessary
>       locking.
>
> Version 2:
>
>     * Completely re-implemented using netdev event handler.
>
>  net/xdp/xsk.c | 65 
> ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 64 insertions(+), 1 deletion(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index a14e8864e4fa..273a419a8c4d 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -693,6 +693,57 @@ static int xsk_mmap(struct file *file, struct 
> socket *sock,
>  			       size, vma->vm_page_prot);
>  }
>
> +static int xsk_notifier(struct notifier_block *this,
> +			unsigned long msg, void *ptr)
> +{
> +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +	struct net *net = dev_net(dev);
> +	int i, unregister_count = 0;
> +	struct sock *sk;
> +
> +	switch (msg) {
> +	case NETDEV_UNREGISTER:
> +		mutex_lock(&net->xdp.lock);

The call is under the rtnl lock, and we're not modifying
the list, so this mutex shouldn't be needed.


> +		sk_for_each(sk, &net->xdp.list) {
> +			struct xdp_sock *xs = xdp_sk(sk);
> +
> +			mutex_lock(&xs->mutex);
> +			if (dev != xs->dev) {
> +				mutex_unlock(&xs->mutex);
> +				continue;
> +			}
> +
> +			sk->sk_err = ENETDOWN;
> +			if (!sock_flag(sk, SOCK_DEAD))
> +				sk->sk_error_report(sk);
> +
> +			/* Wait for driver to stop using the xdp socket. */
> +			xdp_del_sk_umem(xs->umem, xs);
> +			xs->dev = NULL;
> +			synchronize_net();
Isn't this by handled by the unregister_count case below?

> +
> +			/* Clear device references in umem. */
> +			xdp_put_umem(xs->umem);
> +			xs->umem = NULL;

This makes me uneasy.  We need to unregister the umem from
the device (xdp_umem_clear_dev()) but this can remove the umem
pages out from underneath the xsk.

Perhaps what's needed here is the equivalent of an unbind()
call that just detaches the umem/sk from the device, but does
not otherwise tear them down.


> +			mutex_unlock(&xs->mutex);
> +			unregister_count++;
> +		}
> +		mutex_unlock(&net->xdp.lock);
> +
> +		if (unregister_count) {
> +			/* Wait for umem clearing completion. */
> +			synchronize_net();
> +			for (i = 0; i < unregister_count; i++)
> +				dev_put(dev);
> +		}
> +
> +		break;
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
>  static struct proto xsk_proto = {
>  	.name =		"XDP",
>  	.owner =	THIS_MODULE,
> @@ -727,7 +778,8 @@ static void xsk_destruct(struct sock *sk)
>  	if (!sock_flag(sk, SOCK_DEAD))
>  		return;
>
> -	xdp_put_umem(xs->umem);
> +	if (xs->umem)
> +		xdp_put_umem(xs->umem);
Not needed - xdp_put_umem() already does a null check.
Björn Töpel June 11, 2019, 8:09 a.m. UTC | #4
On Mon, 10 Jun 2019 at 22:49, Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
>
> On 10 Jun 2019, at 9:15, Ilya Maximets wrote:
>
> > Device that bound to XDP socket will not have zero refcount until the
> > userspace application will not close it. This leads to hang inside
> > 'netdev_wait_allrefs()' if device unregistering requested:
> >
> >   # ip link del p1
> >   < hang on recvmsg on netlink socket >
> >
> >   # ps -x | grep ip
> >   5126  pts/0    D+   0:00 ip link del p1
> >
> >   # journalctl -b
> >
> >   Jun 05 07:19:16 kernel:
> >   unregister_netdevice: waiting for p1 to become free. Usage count = 1
> >
> >   Jun 05 07:19:27 kernel:
> >   unregister_netdevice: waiting for p1 to become free. Usage count = 1
> >   ...
> >
> > Fix that by implementing NETDEV_UNREGISTER event notification handler
> > to properly clean up all the resources and unref device.
> >
> > This should also allow socket killing via ss(8) utility.
> >
> > Fixes: 965a99098443 ("xsk: add support for bind for Rx")
> > Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> > ---
> >
> > Version 3:
> >
> >     * Declaration lines ordered from longest to shortest.
> >     * Checking of event type moved to the top to avoid unnecessary
> >       locking.
> >
> > Version 2:
> >
> >     * Completely re-implemented using netdev event handler.
> >
> >  net/xdp/xsk.c | 65
> > ++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 64 insertions(+), 1 deletion(-)
> >
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index a14e8864e4fa..273a419a8c4d 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -693,6 +693,57 @@ static int xsk_mmap(struct file *file, struct
> > socket *sock,
> >                              size, vma->vm_page_prot);
> >  }
> >
> > +static int xsk_notifier(struct notifier_block *this,
> > +                     unsigned long msg, void *ptr)
> > +{
> > +     struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> > +     struct net *net = dev_net(dev);
> > +     int i, unregister_count = 0;
> > +     struct sock *sk;
> > +
> > +     switch (msg) {
> > +     case NETDEV_UNREGISTER:
> > +             mutex_lock(&net->xdp.lock);
>
> The call is under the rtnl lock, and we're not modifying
> the list, so this mutex shouldn't be needed.
>

The list can, however, be modified outside the rtnl lock (e.g. at
socket creation). AFAIK the hlist cannot be traversed lock-less,
right?

>
> > +             sk_for_each(sk, &net->xdp.list) {
> > +                     struct xdp_sock *xs = xdp_sk(sk);
> > +
> > +                     mutex_lock(&xs->mutex);
> > +                     if (dev != xs->dev) {
> > +                             mutex_unlock(&xs->mutex);
> > +                             continue;
> > +                     }
> > +
> > +                     sk->sk_err = ENETDOWN;
> > +                     if (!sock_flag(sk, SOCK_DEAD))
> > +                             sk->sk_error_report(sk);
> > +
> > +                     /* Wait for driver to stop using the xdp socket. */
> > +                     xdp_del_sk_umem(xs->umem, xs);
> > +                     xs->dev = NULL;
> > +                     synchronize_net();
> Isn't this by handled by the unregister_count case below?
>

To clarify, setting dev to NULL and xdp_del_sk_umem() + sync makes
sure that a driver doesn't touch the Tx and Rx rings. Nothing can be
assumed about completion + fill ring (umem), until zero-copy has been
disabled via ndo_bpf.

> > +
> > +                     /* Clear device references in umem. */
> > +                     xdp_put_umem(xs->umem);
> > +                     xs->umem = NULL;
>
> This makes me uneasy.  We need to unregister the umem from
> the device (xdp_umem_clear_dev()) but this can remove the umem
> pages out from underneath the xsk.
>

Yes, this is scary. The socket is alive, and userland typically has
the fill/completion rings mmapped. Then the umem refcount is decreased
and can potentially free the umem (fill rings etc.), as Jonathan says,
underneath the xsk. Also, setting the xs umem/dev to zero, while the
socket is alive, would allow a user to re-setup the socket, which we
don't want to allow.

> Perhaps what's needed here is the equivalent of an unbind()
> call that just detaches the umem/sk from the device, but does
> not otherwise tear them down.
>

Yeah, I agree. A detached/zombie state is needed during the socket lifetime.

>
> > +                     mutex_unlock(&xs->mutex);
> > +                     unregister_count++;
> > +             }
> > +             mutex_unlock(&net->xdp.lock);
> > +
> > +             if (unregister_count) {
> > +                     /* Wait for umem clearing completion. */
> > +                     synchronize_net();
> > +                     for (i = 0; i < unregister_count; i++)
> > +                             dev_put(dev);
> > +             }
> > +
> > +             break;
> > +     }
> > +
> > +     return NOTIFY_DONE;
> > +}
> > +
> >  static struct proto xsk_proto = {
> >       .name =         "XDP",
> >       .owner =        THIS_MODULE,
> > @@ -727,7 +778,8 @@ static void xsk_destruct(struct sock *sk)
> >       if (!sock_flag(sk, SOCK_DEAD))
> >               return;
> >
> > -     xdp_put_umem(xs->umem);
> > +     if (xs->umem)
> > +             xdp_put_umem(xs->umem);
> Not needed - xdp_put_umem() already does a null check.
> --
> Jonathan
>
>
> >
> >       sk_refcnt_debug_dec(sk);
> >  }
> > @@ -784,6 +836,10 @@ static const struct net_proto_family
> > xsk_family_ops = {
> >       .owner  = THIS_MODULE,
> >  };
> >
> > +static struct notifier_block xsk_netdev_notifier = {
> > +     .notifier_call  = xsk_notifier,
> > +};
> > +
> >  static int __net_init xsk_net_init(struct net *net)
> >  {
> >       mutex_init(&net->xdp.lock);
> > @@ -816,8 +872,15 @@ static int __init xsk_init(void)
> >       err = register_pernet_subsys(&xsk_net_ops);
> >       if (err)
> >               goto out_sk;
> > +
> > +     err = register_netdevice_notifier(&xsk_netdev_notifier);
> > +     if (err)
> > +             goto out_pernet;
> > +
> >       return 0;
> >
> > +out_pernet:
> > +     unregister_pernet_subsys(&xsk_net_ops);
> >  out_sk:
> >       sock_unregister(PF_XDP);
> >  out_proto:
> > --
> > 2.17.1
Ilya Maximets June 11, 2019, 8:42 a.m. UTC | #5
On 11.06.2019 11:09, Björn Töpel wrote:
> On Mon, 10 Jun 2019 at 22:49, Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
>>
>> On 10 Jun 2019, at 9:15, Ilya Maximets wrote:
>>
>>> Device that bound to XDP socket will not have zero refcount until the
>>> userspace application will not close it. This leads to hang inside
>>> 'netdev_wait_allrefs()' if device unregistering requested:
>>>
>>>   # ip link del p1
>>>   < hang on recvmsg on netlink socket >
>>>
>>>   # ps -x | grep ip
>>>   5126  pts/0    D+   0:00 ip link del p1
>>>
>>>   # journalctl -b
>>>
>>>   Jun 05 07:19:16 kernel:
>>>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>>>
>>>   Jun 05 07:19:27 kernel:
>>>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>>>   ...
>>>
>>> Fix that by implementing NETDEV_UNREGISTER event notification handler
>>> to properly clean up all the resources and unref device.
>>>
>>> This should also allow socket killing via ss(8) utility.
>>>
>>> Fixes: 965a99098443 ("xsk: add support for bind for Rx")
>>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>>> ---
>>>
>>> Version 3:
>>>
>>>     * Declaration lines ordered from longest to shortest.
>>>     * Checking of event type moved to the top to avoid unnecessary
>>>       locking.
>>>
>>> Version 2:
>>>
>>>     * Completely re-implemented using netdev event handler.
>>>
>>>  net/xdp/xsk.c | 65
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>  1 file changed, 64 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>>> index a14e8864e4fa..273a419a8c4d 100644
>>> --- a/net/xdp/xsk.c
>>> +++ b/net/xdp/xsk.c
>>> @@ -693,6 +693,57 @@ static int xsk_mmap(struct file *file, struct
>>> socket *sock,
>>>                              size, vma->vm_page_prot);
>>>  }
>>>
>>> +static int xsk_notifier(struct notifier_block *this,
>>> +                     unsigned long msg, void *ptr)
>>> +{
>>> +     struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>>> +     struct net *net = dev_net(dev);
>>> +     int i, unregister_count = 0;
>>> +     struct sock *sk;
>>> +
>>> +     switch (msg) {
>>> +     case NETDEV_UNREGISTER:
>>> +             mutex_lock(&net->xdp.lock);
>>
>> The call is under the rtnl lock, and we're not modifying
>> the list, so this mutex shouldn't be needed.
>>
> 
> The list can, however, be modified outside the rtnl lock (e.g. at
> socket creation). AFAIK the hlist cannot be traversed lock-less,
> right?

We could use 'rcu_read_lock' instead and iterate with 'sk_for_each_rcu',
but we'll not be able to synchronize inside.

> 
>>
>>> +             sk_for_each(sk, &net->xdp.list) {
>>> +                     struct xdp_sock *xs = xdp_sk(sk);
>>> +
>>> +                     mutex_lock(&xs->mutex);
>>> +                     if (dev != xs->dev) {
>>> +                             mutex_unlock(&xs->mutex);
>>> +                             continue;
>>> +                     }
>>> +
>>> +                     sk->sk_err = ENETDOWN;
>>> +                     if (!sock_flag(sk, SOCK_DEAD))
>>> +                             sk->sk_error_report(sk);
>>> +
>>> +                     /* Wait for driver to stop using the xdp socket. */
>>> +                     xdp_del_sk_umem(xs->umem, xs);
>>> +                     xs->dev = NULL;
>>> +                     synchronize_net();
>> Isn't this by handled by the unregister_count case below?
>>
> 
> To clarify, setting dev to NULL and xdp_del_sk_umem() + sync makes
> sure that a driver doesn't touch the Tx and Rx rings. Nothing can be
> assumed about completion + fill ring (umem), until zero-copy has been
> disabled via ndo_bpf.
> 
>>> +
>>> +                     /* Clear device references in umem. */
>>> +                     xdp_put_umem(xs->umem);
>>> +                     xs->umem = NULL;
>>
>> This makes me uneasy.  We need to unregister the umem from
>> the device (xdp_umem_clear_dev()) but this can remove the umem
>> pages out from underneath the xsk.
>>
> 
> Yes, this is scary. The socket is alive, and userland typically has
> the fill/completion rings mmapped. Then the umem refcount is decreased
> and can potentially free the umem (fill rings etc.), as Jonathan says,
> underneath the xsk. Also, setting the xs umem/dev to zero, while the
> socket is alive, would allow a user to re-setup the socket, which we
> don't want to allow.
> 
>> Perhaps what's needed here is the equivalent of an unbind()
>> call that just detaches the umem/sk from the device, but does
>> not otherwise tear them down.
>>
> 
> Yeah, I agree. A detached/zombie state is needed during the socket lifetime.


I could try to rip the 'xdp_umem_release()' apart so the 'xdp_umem_clear_dev()'
could be called separately. This will allow to not tear down the 'umem'.
However, it seems that it'll not be easy to synchronize all parts.
Any suggestions are welcome.

Also, there is no way to not clear the 'dev' as we have to put the reference.
Maybe we could add the additional check to 'xsk_bind()' for current device
state (dev->reg_state == NETREG_REGISTERED). This will allow us to avoid
re-setup of the socket.

> 
>>
>>> +                     mutex_unlock(&xs->mutex);
>>> +                     unregister_count++;
>>> +             }
>>> +             mutex_unlock(&net->xdp.lock);
>>> +
>>> +             if (unregister_count) {
>>> +                     /* Wait for umem clearing completion. */
>>> +                     synchronize_net();
>>> +                     for (i = 0; i < unregister_count; i++)
>>> +                             dev_put(dev);
>>> +             }
>>> +
>>> +             break;
>>> +     }
>>> +
>>> +     return NOTIFY_DONE;
>>> +}
>>> +
>>>  static struct proto xsk_proto = {
>>>       .name =         "XDP",
>>>       .owner =        THIS_MODULE,
>>> @@ -727,7 +778,8 @@ static void xsk_destruct(struct sock *sk)
>>>       if (!sock_flag(sk, SOCK_DEAD))
>>>               return;
>>>
>>> -     xdp_put_umem(xs->umem);
>>> +     if (xs->umem)
>>> +             xdp_put_umem(xs->umem);
>> Not needed - xdp_put_umem() already does a null check.

Indeed. Thanks.

>> --
>> Jonathan
>>
>>
>>>
>>>       sk_refcnt_debug_dec(sk);
>>>  }
>>> @@ -784,6 +836,10 @@ static const struct net_proto_family
>>> xsk_family_ops = {
>>>       .owner  = THIS_MODULE,
>>>  };
>>>
>>> +static struct notifier_block xsk_netdev_notifier = {
>>> +     .notifier_call  = xsk_notifier,
>>> +};
>>> +
>>>  static int __net_init xsk_net_init(struct net *net)
>>>  {
>>>       mutex_init(&net->xdp.lock);
>>> @@ -816,8 +872,15 @@ static int __init xsk_init(void)
>>>       err = register_pernet_subsys(&xsk_net_ops);
>>>       if (err)
>>>               goto out_sk;
>>> +
>>> +     err = register_netdevice_notifier(&xsk_netdev_notifier);
>>> +     if (err)
>>> +             goto out_pernet;
>>> +
>>>       return 0;
>>>
>>> +out_pernet:
>>> +     unregister_pernet_subsys(&xsk_net_ops);
>>>  out_sk:
>>>       sock_unregister(PF_XDP);
>>>  out_proto:
>>> --
>>> 2.17.1
> 
>
Björn Töpel June 11, 2019, 12:13 p.m. UTC | #6
On Tue, 11 Jun 2019 at 10:42, Ilya Maximets <i.maximets@samsung.com> wrote:
>
> On 11.06.2019 11:09, Björn Töpel wrote:
> > On Mon, 10 Jun 2019 at 22:49, Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
> >>
> >> On 10 Jun 2019, at 9:15, Ilya Maximets wrote:
> >>
> >>> Device that bound to XDP socket will not have zero refcount until the
> >>> userspace application will not close it. This leads to hang inside
> >>> 'netdev_wait_allrefs()' if device unregistering requested:
> >>>
> >>>   # ip link del p1
> >>>   < hang on recvmsg on netlink socket >
> >>>
> >>>   # ps -x | grep ip
> >>>   5126  pts/0    D+   0:00 ip link del p1
> >>>
> >>>   # journalctl -b
> >>>
> >>>   Jun 05 07:19:16 kernel:
> >>>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
> >>>
> >>>   Jun 05 07:19:27 kernel:
> >>>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
> >>>   ...
> >>>
> >>> Fix that by implementing NETDEV_UNREGISTER event notification handler
> >>> to properly clean up all the resources and unref device.
> >>>
> >>> This should also allow socket killing via ss(8) utility.
> >>>
> >>> Fixes: 965a99098443 ("xsk: add support for bind for Rx")
> >>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> >>> ---
> >>>
> >>> Version 3:
> >>>
> >>>     * Declaration lines ordered from longest to shortest.
> >>>     * Checking of event type moved to the top to avoid unnecessary
> >>>       locking.
> >>>
> >>> Version 2:
> >>>
> >>>     * Completely re-implemented using netdev event handler.
> >>>
> >>>  net/xdp/xsk.c | 65
> >>> ++++++++++++++++++++++++++++++++++++++++++++++++++-
> >>>  1 file changed, 64 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> >>> index a14e8864e4fa..273a419a8c4d 100644
> >>> --- a/net/xdp/xsk.c
> >>> +++ b/net/xdp/xsk.c
> >>> @@ -693,6 +693,57 @@ static int xsk_mmap(struct file *file, struct
> >>> socket *sock,
> >>>                              size, vma->vm_page_prot);
> >>>  }
> >>>
> >>> +static int xsk_notifier(struct notifier_block *this,
> >>> +                     unsigned long msg, void *ptr)
> >>> +{
> >>> +     struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> >>> +     struct net *net = dev_net(dev);
> >>> +     int i, unregister_count = 0;
> >>> +     struct sock *sk;
> >>> +
> >>> +     switch (msg) {
> >>> +     case NETDEV_UNREGISTER:
> >>> +             mutex_lock(&net->xdp.lock);
> >>
> >> The call is under the rtnl lock, and we're not modifying
> >> the list, so this mutex shouldn't be needed.
> >>
> >
> > The list can, however, be modified outside the rtnl lock (e.g. at
> > socket creation). AFAIK the hlist cannot be traversed lock-less,
> > right?
>
> We could use 'rcu_read_lock' instead and iterate with 'sk_for_each_rcu',
> but we'll not be able to synchronize inside.
>
> >
> >>
> >>> +             sk_for_each(sk, &net->xdp.list) {
> >>> +                     struct xdp_sock *xs = xdp_sk(sk);
> >>> +
> >>> +                     mutex_lock(&xs->mutex);
> >>> +                     if (dev != xs->dev) {
> >>> +                             mutex_unlock(&xs->mutex);
> >>> +                             continue;
> >>> +                     }
> >>> +
> >>> +                     sk->sk_err = ENETDOWN;
> >>> +                     if (!sock_flag(sk, SOCK_DEAD))
> >>> +                             sk->sk_error_report(sk);
> >>> +
> >>> +                     /* Wait for driver to stop using the xdp socket. */
> >>> +                     xdp_del_sk_umem(xs->umem, xs);
> >>> +                     xs->dev = NULL;
> >>> +                     synchronize_net();
> >> Isn't this by handled by the unregister_count case below?
> >>
> >
> > To clarify, setting dev to NULL and xdp_del_sk_umem() + sync makes
> > sure that a driver doesn't touch the Tx and Rx rings. Nothing can be
> > assumed about completion + fill ring (umem), until zero-copy has been
> > disabled via ndo_bpf.
> >
> >>> +
> >>> +                     /* Clear device references in umem. */
> >>> +                     xdp_put_umem(xs->umem);
> >>> +                     xs->umem = NULL;
> >>
> >> This makes me uneasy.  We need to unregister the umem from
> >> the device (xdp_umem_clear_dev()) but this can remove the umem
> >> pages out from underneath the xsk.
> >>
> >
> > Yes, this is scary. The socket is alive, and userland typically has
> > the fill/completion rings mmapped. Then the umem refcount is decreased
> > and can potentially free the umem (fill rings etc.), as Jonathan says,
> > underneath the xsk. Also, setting the xs umem/dev to zero, while the
> > socket is alive, would allow a user to re-setup the socket, which we
> > don't want to allow.
> >
> >> Perhaps what's needed here is the equivalent of an unbind()
> >> call that just detaches the umem/sk from the device, but does
> >> not otherwise tear them down.
> >>
> >
> > Yeah, I agree. A detached/zombie state is needed during the socket lifetime.
>
>
> I could try to rip the 'xdp_umem_release()' apart so the 'xdp_umem_clear_dev()'
> could be called separately. This will allow to not tear down the 'umem'.
> However, it seems that it'll not be easy to synchronize all parts.
> Any suggestions are welcome.
>

Thanks for continuing to work on this, Ilya.

What need to be done is exactly an "unbind()", i.e. returning the
socket to the state prior bind, but disallowing any changes from
userland (e.g. setsockopt/bind). So, unbind() + track that we're in
"unbound" mode. :-) I think breaking up xdp_umem_release() is good way
to go.

> Also, there is no way to not clear the 'dev' as we have to put the reference.
> Maybe we could add the additional check to 'xsk_bind()' for current device
> state (dev->reg_state == NETREG_REGISTERED). This will allow us to avoid
> re-setup of the socket.
>

Yes, and also make sure that the rest of the syscall implementations
don't allow for re-setup.


Björn

> >
> >>
> >>> +                     mutex_unlock(&xs->mutex);
> >>> +                     unregister_count++;
> >>> +             }
> >>> +             mutex_unlock(&net->xdp.lock);
> >>> +
> >>> +             if (unregister_count) {
> >>> +                     /* Wait for umem clearing completion. */
> >>> +                     synchronize_net();
> >>> +                     for (i = 0; i < unregister_count; i++)
> >>> +                             dev_put(dev);
> >>> +             }
> >>> +
> >>> +             break;
> >>> +     }
> >>> +
> >>> +     return NOTIFY_DONE;
> >>> +}
> >>> +
> >>>  static struct proto xsk_proto = {
> >>>       .name =         "XDP",
> >>>       .owner =        THIS_MODULE,
> >>> @@ -727,7 +778,8 @@ static void xsk_destruct(struct sock *sk)
> >>>       if (!sock_flag(sk, SOCK_DEAD))
> >>>               return;
> >>>
> >>> -     xdp_put_umem(xs->umem);
> >>> +     if (xs->umem)
> >>> +             xdp_put_umem(xs->umem);
> >> Not needed - xdp_put_umem() already does a null check.
>
> Indeed. Thanks.
>
> >> --
> >> Jonathan
> >>
> >>
> >>>
> >>>       sk_refcnt_debug_dec(sk);
> >>>  }
> >>> @@ -784,6 +836,10 @@ static const struct net_proto_family
> >>> xsk_family_ops = {
> >>>       .owner  = THIS_MODULE,
> >>>  };
> >>>
> >>> +static struct notifier_block xsk_netdev_notifier = {
> >>> +     .notifier_call  = xsk_notifier,
> >>> +};
> >>> +
> >>>  static int __net_init xsk_net_init(struct net *net)
> >>>  {
> >>>       mutex_init(&net->xdp.lock);
> >>> @@ -816,8 +872,15 @@ static int __init xsk_init(void)
> >>>       err = register_pernet_subsys(&xsk_net_ops);
> >>>       if (err)
> >>>               goto out_sk;
> >>> +
> >>> +     err = register_netdevice_notifier(&xsk_netdev_notifier);
> >>> +     if (err)
> >>> +             goto out_pernet;
> >>> +
> >>>       return 0;
> >>>
> >>> +out_pernet:
> >>> +     unregister_pernet_subsys(&xsk_net_ops);
> >>>  out_sk:
> >>>       sock_unregister(PF_XDP);
> >>>  out_proto:
> >>> --
> >>> 2.17.1
> >
> >
Ilya Maximets June 11, 2019, 3:42 p.m. UTC | #7
On 11.06.2019 15:13, Björn Töpel wrote:
> On Tue, 11 Jun 2019 at 10:42, Ilya Maximets <i.maximets@samsung.com> wrote:
>>
>> On 11.06.2019 11:09, Björn Töpel wrote:
>>> On Mon, 10 Jun 2019 at 22:49, Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
>>>>
>>>> On 10 Jun 2019, at 9:15, Ilya Maximets wrote:
>>>>
>>>>> Device that bound to XDP socket will not have zero refcount until the
>>>>> userspace application will not close it. This leads to hang inside
>>>>> 'netdev_wait_allrefs()' if device unregistering requested:
>>>>>
>>>>>   # ip link del p1
>>>>>   < hang on recvmsg on netlink socket >
>>>>>
>>>>>   # ps -x | grep ip
>>>>>   5126  pts/0    D+   0:00 ip link del p1
>>>>>
>>>>>   # journalctl -b
>>>>>
>>>>>   Jun 05 07:19:16 kernel:
>>>>>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>>>>>
>>>>>   Jun 05 07:19:27 kernel:
>>>>>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>>>>>   ...
>>>>>
>>>>> Fix that by implementing NETDEV_UNREGISTER event notification handler
>>>>> to properly clean up all the resources and unref device.
>>>>>
>>>>> This should also allow socket killing via ss(8) utility.
>>>>>
>>>>> Fixes: 965a99098443 ("xsk: add support for bind for Rx")
>>>>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>>>>> ---
>>>>>
>>>>> Version 3:
>>>>>
>>>>>     * Declaration lines ordered from longest to shortest.
>>>>>     * Checking of event type moved to the top to avoid unnecessary
>>>>>       locking.
>>>>>
>>>>> Version 2:
>>>>>
>>>>>     * Completely re-implemented using netdev event handler.
>>>>>
>>>>>  net/xdp/xsk.c | 65
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>  1 file changed, 64 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>>>>> index a14e8864e4fa..273a419a8c4d 100644
>>>>> --- a/net/xdp/xsk.c
>>>>> +++ b/net/xdp/xsk.c
>>>>> @@ -693,6 +693,57 @@ static int xsk_mmap(struct file *file, struct
>>>>> socket *sock,
>>>>>                              size, vma->vm_page_prot);
>>>>>  }
>>>>>
>>>>> +static int xsk_notifier(struct notifier_block *this,
>>>>> +                     unsigned long msg, void *ptr)
>>>>> +{
>>>>> +     struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>>>>> +     struct net *net = dev_net(dev);
>>>>> +     int i, unregister_count = 0;
>>>>> +     struct sock *sk;
>>>>> +
>>>>> +     switch (msg) {
>>>>> +     case NETDEV_UNREGISTER:
>>>>> +             mutex_lock(&net->xdp.lock);
>>>>
>>>> The call is under the rtnl lock, and we're not modifying
>>>> the list, so this mutex shouldn't be needed.
>>>>
>>>
>>> The list can, however, be modified outside the rtnl lock (e.g. at
>>> socket creation). AFAIK the hlist cannot be traversed lock-less,
>>> right?
>>
>> We could use 'rcu_read_lock' instead and iterate with 'sk_for_each_rcu',
>> but we'll not be able to synchronize inside.
>>
>>>
>>>>
>>>>> +             sk_for_each(sk, &net->xdp.list) {
>>>>> +                     struct xdp_sock *xs = xdp_sk(sk);
>>>>> +
>>>>> +                     mutex_lock(&xs->mutex);
>>>>> +                     if (dev != xs->dev) {
>>>>> +                             mutex_unlock(&xs->mutex);
>>>>> +                             continue;
>>>>> +                     }
>>>>> +
>>>>> +                     sk->sk_err = ENETDOWN;
>>>>> +                     if (!sock_flag(sk, SOCK_DEAD))
>>>>> +                             sk->sk_error_report(sk);
>>>>> +
>>>>> +                     /* Wait for driver to stop using the xdp socket. */
>>>>> +                     xdp_del_sk_umem(xs->umem, xs);
>>>>> +                     xs->dev = NULL;
>>>>> +                     synchronize_net();
>>>> Isn't this by handled by the unregister_count case below?
>>>>
>>>
>>> To clarify, setting dev to NULL and xdp_del_sk_umem() + sync makes
>>> sure that a driver doesn't touch the Tx and Rx rings. Nothing can be
>>> assumed about completion + fill ring (umem), until zero-copy has been
>>> disabled via ndo_bpf.
>>>
>>>>> +
>>>>> +                     /* Clear device references in umem. */
>>>>> +                     xdp_put_umem(xs->umem);
>>>>> +                     xs->umem = NULL;
>>>>
>>>> This makes me uneasy.  We need to unregister the umem from
>>>> the device (xdp_umem_clear_dev()) but this can remove the umem
>>>> pages out from underneath the xsk.
>>>>
>>>
>>> Yes, this is scary. The socket is alive, and userland typically has
>>> the fill/completion rings mmapped. Then the umem refcount is decreased
>>> and can potentially free the umem (fill rings etc.), as Jonathan says,
>>> underneath the xsk. Also, setting the xs umem/dev to zero, while the
>>> socket is alive, would allow a user to re-setup the socket, which we
>>> don't want to allow.
>>>
>>>> Perhaps what's needed here is the equivalent of an unbind()
>>>> call that just detaches the umem/sk from the device, but does
>>>> not otherwise tear them down.
>>>>
>>>
>>> Yeah, I agree. A detached/zombie state is needed during the socket lifetime.
>>
>>
>> I could try to rip the 'xdp_umem_release()' apart so the 'xdp_umem_clear_dev()'
>> could be called separately. This will allow to not tear down the 'umem'.
>> However, it seems that it'll not be easy to synchronize all parts.
>> Any suggestions are welcome.
>>
> 
> Thanks for continuing to work on this, Ilya.
> 
> What need to be done is exactly an "unbind()", i.e. returning the
> socket to the state prior bind, but disallowing any changes from
> userland (e.g. setsockopt/bind). So, unbind() + track that we're in
> "unbound" mode. :-) I think breaking up xdp_umem_release() is good way
> to go.

Thanks, I'll move in this direction.

BTW, I'll be out of office from tomorrow until the end of the week.
So, I'll most probably return to this on Monday.

> 
>> Also, there is no way to not clear the 'dev' as we have to put the reference.
>> Maybe we could add the additional check to 'xsk_bind()' for current device
>> state (dev->reg_state == NETREG_REGISTERED). This will allow us to avoid
>> re-setup of the socket.
>>
> 
> Yes, and also make sure that the rest of the syscall implementations
> don't allow for re-setup.

OK.

> 
> 
> Björn
> 
>>>
>>>>
>>>>> +                     mutex_unlock(&xs->mutex);
>>>>> +                     unregister_count++;
>>>>> +             }
>>>>> +             mutex_unlock(&net->xdp.lock);
>>>>> +
>>>>> +             if (unregister_count) {
>>>>> +                     /* Wait for umem clearing completion. */
>>>>> +                     synchronize_net();
>>>>> +                     for (i = 0; i < unregister_count; i++)
>>>>> +                             dev_put(dev);
>>>>> +             }
>>>>> +
>>>>> +             break;
>>>>> +     }
>>>>> +
>>>>> +     return NOTIFY_DONE;
>>>>> +}
>>>>> +
>>>>>  static struct proto xsk_proto = {
>>>>>       .name =         "XDP",
>>>>>       .owner =        THIS_MODULE,
>>>>> @@ -727,7 +778,8 @@ static void xsk_destruct(struct sock *sk)
>>>>>       if (!sock_flag(sk, SOCK_DEAD))
>>>>>               return;
>>>>>
>>>>> -     xdp_put_umem(xs->umem);
>>>>> +     if (xs->umem)
>>>>> +             xdp_put_umem(xs->umem);
>>>> Not needed - xdp_put_umem() already does a null check.
>>
>> Indeed. Thanks.
>>
>>>> --
>>>> Jonathan
>>>>
>>>>
>>>>>
>>>>>       sk_refcnt_debug_dec(sk);
>>>>>  }
>>>>> @@ -784,6 +836,10 @@ static const struct net_proto_family
>>>>> xsk_family_ops = {
>>>>>       .owner  = THIS_MODULE,
>>>>>  };
>>>>>
>>>>> +static struct notifier_block xsk_netdev_notifier = {
>>>>> +     .notifier_call  = xsk_notifier,
>>>>> +};
>>>>> +
>>>>>  static int __net_init xsk_net_init(struct net *net)
>>>>>  {
>>>>>       mutex_init(&net->xdp.lock);
>>>>> @@ -816,8 +872,15 @@ static int __init xsk_init(void)
>>>>>       err = register_pernet_subsys(&xsk_net_ops);
>>>>>       if (err)
>>>>>               goto out_sk;
>>>>> +
>>>>> +     err = register_netdevice_notifier(&xsk_netdev_notifier);
>>>>> +     if (err)
>>>>> +             goto out_pernet;
>>>>> +
>>>>>       return 0;
>>>>>
>>>>> +out_pernet:
>>>>> +     unregister_pernet_subsys(&xsk_net_ops);
>>>>>  out_sk:
>>>>>       sock_unregister(PF_XDP);
>>>>>  out_proto:
>>>>> --
>>>>> 2.17.1
>>>
>>>
> 
>
diff mbox series

Patch

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index a14e8864e4fa..273a419a8c4d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -693,6 +693,57 @@  static int xsk_mmap(struct file *file, struct socket *sock,
 			       size, vma->vm_page_prot);
 }
 
+static int xsk_notifier(struct notifier_block *this,
+			unsigned long msg, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(dev);
+	int i, unregister_count = 0;
+	struct sock *sk;
+
+	switch (msg) {
+	case NETDEV_UNREGISTER:
+		mutex_lock(&net->xdp.lock);
+		sk_for_each(sk, &net->xdp.list) {
+			struct xdp_sock *xs = xdp_sk(sk);
+
+			mutex_lock(&xs->mutex);
+			if (dev != xs->dev) {
+				mutex_unlock(&xs->mutex);
+				continue;
+			}
+
+			sk->sk_err = ENETDOWN;
+			if (!sock_flag(sk, SOCK_DEAD))
+				sk->sk_error_report(sk);
+
+			/* Wait for driver to stop using the xdp socket. */
+			xdp_del_sk_umem(xs->umem, xs);
+			xs->dev = NULL;
+			synchronize_net();
+
+			/* Clear device references in umem. */
+			xdp_put_umem(xs->umem);
+			xs->umem = NULL;
+
+			mutex_unlock(&xs->mutex);
+			unregister_count++;
+		}
+		mutex_unlock(&net->xdp.lock);
+
+		if (unregister_count) {
+			/* Wait for umem clearing completion. */
+			synchronize_net();
+			for (i = 0; i < unregister_count; i++)
+				dev_put(dev);
+		}
+
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
 static struct proto xsk_proto = {
 	.name =		"XDP",
 	.owner =	THIS_MODULE,
@@ -727,7 +778,8 @@  static void xsk_destruct(struct sock *sk)
 	if (!sock_flag(sk, SOCK_DEAD))
 		return;
 
-	xdp_put_umem(xs->umem);
+	if (xs->umem)
+		xdp_put_umem(xs->umem);
 
 	sk_refcnt_debug_dec(sk);
 }
@@ -784,6 +836,10 @@  static const struct net_proto_family xsk_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
+static struct notifier_block xsk_netdev_notifier = {
+	.notifier_call	= xsk_notifier,
+};
+
 static int __net_init xsk_net_init(struct net *net)
 {
 	mutex_init(&net->xdp.lock);
@@ -816,8 +872,15 @@  static int __init xsk_init(void)
 	err = register_pernet_subsys(&xsk_net_ops);
 	if (err)
 		goto out_sk;
+
+	err = register_netdevice_notifier(&xsk_netdev_notifier);
+	if (err)
+		goto out_pernet;
+
 	return 0;
 
+out_pernet:
+	unregister_pernet_subsys(&xsk_net_ops);
 out_sk:
 	sock_unregister(PF_XDP);
 out_proto: