Message ID | 1369739242-5944-1-git-send-email-amwang@redhat.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Tue, 28 May 2013 19:07:22 +0800 Cong Wang <amwang@redhat.com> wrote: > From: Cong Wang <amwang@redhat.com> > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > we use kfree_rcu() to free ->vn_sock, but a) there is no use > of RCU API to access this filed, b) RCU is not enough to do refcnt > here, because in vxlan_leave_group() we drop RTNL lock before > locking the socket, it could be possible that this field is > freed during this period. > > So, instead making things complex, just do basic refcnt for > the ->vn_sock, like we do for others. > > Cc: Stephen Hemminger <stephen@networkplumber.org> > Cc: David S. Miller <davem@davemloft.net> > Signed-off-by: Cong Wang <amwang@redhat.com> > > --- > diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c > index 5ed64d4..73a674e 100644 > --- a/drivers/net/vxlan.c > +++ b/drivers/net/vxlan.c > @@ -83,9 +83,8 @@ static unsigned int vxlan_net_id; > /* per UDP socket information */ > struct vxlan_sock { > struct hlist_node hlist; > - struct rcu_head rcu; > struct work_struct del_work; > - unsigned int refcnt; > + atomic_t refcnt; > struct socket *sock; > struct hlist_head vni_list[VNI_HASH_SIZE]; > }; > @@ -164,14 +163,30 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) > return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; > } > > +static inline void vxlan_sock_get(struct vxlan_sock *vs) > +{ > + atomic_inc(&vs->refcnt); > +} > + > +static inline void vxlan_sock_put(struct vxlan_sock *vs) > +{ > + if (atomic_dec_and_test(&vs->refcnt)) { > + hlist_del_rcu(&vs->hlist); > + schedule_work(&vs->del_work); > + } > +} > + > + > /* Find VXLAN socket based on network namespace and UDP port */ > static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) > { > struct vxlan_sock *vs; > > hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { > + vxlan_sock_get(vs); > if (inet_sk(vs->sock->sk)->inet_sport == port) > return vs; > + vxlan_sock_put(vs); > } > return NULL; > } > @@ -187,10 +202,13 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) > return NULL; > > hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { > - if (vxlan->default_dst.remote_vni == id) > + if (vxlan->default_dst.remote_vni == id) { > + vxlan_sock_put(vs); > return vxlan; > + } > } > > + vxlan_sock_put(vs); > return NULL; > } > > @@ -926,7 +944,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) > return false; > } > > -static void vxlan_sock_put(struct sk_buff *skb) > +static void vxlan_sock_destructor(struct sk_buff *skb) > { > sock_put(skb->sk); > } > @@ -940,7 +958,7 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) > skb_orphan(skb); > sock_hold(sk); > skb->sk = sk; > - skb->destructor = vxlan_sock_put; > + skb->destructor = vxlan_sock_destructor; > } > > /* Compute source port for outgoing packet > @@ -1255,10 +1273,14 @@ static int vxlan_open(struct net_device *dev) > struct vxlan_dev *vxlan = netdev_priv(dev); > int err; > > + vxlan_sock_get(vxlan->vn_sock); > + > if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { > err = vxlan_join_group(dev); > - if (err) > + if (err) { > + vxlan_sock_put(vxlan->vn_sock); > return err; > + } > } > > if (vxlan->age_interval) > @@ -1294,6 +1316,8 @@ static int vxlan_stop(struct net_device *dev) > > del_timer_sync(&vxlan->age_timer); > > + vxlan_sock_put(vxlan->vn_sock); > + > vxlan_flush(vxlan); > > return 0; > @@ -1447,7 +1471,7 @@ static void vxlan_del_work(struct work_struct *work) > struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work); > > sk_release_kernel(vs->sock->sk); > - kfree_rcu(vs, rcu); > + kfree(vs); > } > > /* Create new listen socket if needed */ > @@ -1503,7 +1527,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) > udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; > udp_encap_enable(); > > - vs->refcnt = 1; > + atomic_set(&vs->refcnt, 1); > return vs; > } > > @@ -1592,9 +1616,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, > } > > vs = vxlan_find_port(net, vxlan->dst_port); > - if (vs) > - ++vs->refcnt; > - else { > + if (!vs) { > /* Drop lock because socket create acquires RTNL lock */ > rtnl_unlock(); > vs = vxlan_socket_create(net, vxlan->dst_port); > @@ -1610,12 +1632,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, > > err = register_netdevice(dev); > if (err) { > - if (--vs->refcnt == 0) { > - rtnl_unlock(); > - sk_release_kernel(vs->sock->sk); > - kfree(vs); > - rtnl_lock(); > - } > + vxlan_sock_put(vs); > return err; > } > > @@ -1634,10 +1651,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) > list_del(&vxlan->next); > unregister_netdevice_queue(dev, head); > > - if (--vs->refcnt == 0) { > - hlist_del_rcu(&vs->hlist); > - schedule_work(&vs->del_work); > - } > + vxlan_sock_put(vs); > } > > static size_t vxlan_get_size(const struct net_device *dev) Not needed all access is under RTNL -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote: > On Tue, 28 May 2013 19:07:22 +0800 > Cong Wang <amwang@redhat.com> wrote: > > > From: Cong Wang <amwang@redhat.com> > > > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > > we use kfree_rcu() to free ->vn_sock, but a) there is no use > > of RCU API to access this filed, b) RCU is not enough to do refcnt > > here, because in vxlan_leave_group() we drop RTNL lock before > > locking the socket, it could be possible that this field is > > freed during this period. > > > > So, instead making things complex, just do basic refcnt for > > the ->vn_sock, like we do for others. > > ... > > Not needed all access is under RTNL I know, this is why I had a patch (not posted) which adds the missing rtnl_dereference(), but even if we had these, it is still not correct. As I explained in the changelog, vxlan_leave_group() has a problem, because it releases rtnl lock before locking the socket, _and_ it is called after vxlan_dellink() which schedules a work to cleanup the struct. Therefore the ->vn_sock could be freed right after rtnl lock is released. Am I miss anything? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2013-05-28 at 19:07 +0800, Cong Wang wrote: > From: Cong Wang <amwang@redhat.com> > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > we use kfree_rcu() to free ->vn_sock, but a) there is no use > of RCU API to access this filed, b) RCU is not enough to do refcnt > here, because in vxlan_leave_group() we drop RTNL lock before > locking the socket, it could be possible that this field is > freed during this period. > > So, instead making things complex, just do basic refcnt for > the ->vn_sock, like we do for others. BTW, this patch fixes a real crash when I test vxlan over IPv6, although I can't reproduce it with IPv4. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 29 May 2013 10:08:53 +0800 Cong Wang <amwang@redhat.com> wrote: > On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote: > > On Tue, 28 May 2013 19:07:22 +0800 > > Cong Wang <amwang@redhat.com> wrote: > > > > > From: Cong Wang <amwang@redhat.com> > > > > > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > > > we use kfree_rcu() to free ->vn_sock, but a) there is no use > > > of RCU API to access this filed, b) RCU is not enough to do refcnt > > > here, because in vxlan_leave_group() we drop RTNL lock before > > > locking the socket, it could be possible that this field is > > > freed during this period. > > > > > > So, instead making things complex, just do basic refcnt for > > > the ->vn_sock, like we do for others. > > > > ... > > > > Not needed all access is under RTNL > > I know, this is why I had a patch (not posted) which adds the missing > rtnl_dereference(), but even if we had these, it is still not correct. > > As I explained in the changelog, vxlan_leave_group() has a problem, > because it releases rtnl lock before locking the socket, _and_ it is > called after vxlan_dellink() which schedules a work to cleanup the > struct. Therefore the ->vn_sock could be freed right after rtnl lock is > released. > > Am I miss anything? Ignoring your IPv6 code for now... With IPV4: refcnt is incremented when socket is incremented in newlink (RTNL held). refcnt is decremented in by dellink (RTNL held) and socket is deleted from list leave_group doesn't happen until work queue is fired. rtnl_dereference is fine, but hardly necessary when the call hierarchy is so obvious. The problem you describe won't be fixed by just converting it to atomic, I think you need add a dev_hold()/dev_put to vxlan_stop to prevent device from being deleted when rtnl_lock is dropped. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2013-05-28 at 21:22 -0700, Stephen Hemminger wrote: > On Wed, 29 May 2013 10:08:53 +0800 > Cong Wang <amwang@redhat.com> wrote: > > > On Tue, 2013-05-28 at 08:22 -0700, Stephen Hemminger wrote: > > > On Tue, 28 May 2013 19:07:22 +0800 > > > Cong Wang <amwang@redhat.com> wrote: > > > > > > > From: Cong Wang <amwang@redhat.com> > > > > > > > > In commit 553675fb5e9ce3d71a (vxlan: listen on multiple ports), > > > > we use kfree_rcu() to free ->vn_sock, but a) there is no use > > > > of RCU API to access this filed, b) RCU is not enough to do refcnt > > > > here, because in vxlan_leave_group() we drop RTNL lock before > > > > locking the socket, it could be possible that this field is > > > > freed during this period. > > > > > > > > So, instead making things complex, just do basic refcnt for > > > > the ->vn_sock, like we do for others. > > > > > > ... > > > > > > Not needed all access is under RTNL > > > > I know, this is why I had a patch (not posted) which adds the missing > > rtnl_dereference(), but even if we had these, it is still not correct. > > > > As I explained in the changelog, vxlan_leave_group() has a problem, > > because it releases rtnl lock before locking the socket, _and_ it is > > called after vxlan_dellink() which schedules a work to cleanup the > > struct. Therefore the ->vn_sock could be freed right after rtnl lock is > > released. > > > > Am I miss anything? > > Ignoring your IPv6 code for now... > > With IPV4: > refcnt is incremented when socket is incremented in newlink (RTNL held). > refcnt is decremented in by dellink (RTNL held) and socket is deleted from list > leave_group doesn't happen until work queue is fired. > > rtnl_dereference is fine, but hardly necessary when the call hierarchy is so obvious. > > The problem you describe won't be fixed by just converting it to atomic, My patch is _not_ just converting it to atomic_t, but it takes a ref for every usage of ->vn_sock, which current implementation misses. > I think you need add a dev_hold()/dev_put to vxlan_stop to prevent > device from being deleted when rtnl_lock is dropped. > The crash is that lock_sock() got a NULL-def bug, which is not the related with dev_hold() at all. I think it is due to the whole ->vn_sock is freed before calling lock_sock(), thus vxlan->vn_sock->sock->sk points to a freed memory area. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 5ed64d4..73a674e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -83,9 +83,8 @@ static unsigned int vxlan_net_id; /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; - struct rcu_head rcu; struct work_struct del_work; - unsigned int refcnt; + atomic_t refcnt; struct socket *sock; struct hlist_head vni_list[VNI_HASH_SIZE]; }; @@ -164,14 +163,30 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; } +static inline void vxlan_sock_get(struct vxlan_sock *vs) +{ + atomic_inc(&vs->refcnt); +} + +static inline void vxlan_sock_put(struct vxlan_sock *vs) +{ + if (atomic_dec_and_test(&vs->refcnt)) { + hlist_del_rcu(&vs->hlist); + schedule_work(&vs->del_work); + } +} + + /* Find VXLAN socket based on network namespace and UDP port */ static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port) { struct vxlan_sock *vs; hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { + vxlan_sock_get(vs); if (inet_sk(vs->sock->sk)->inet_sport == port) return vs; + vxlan_sock_put(vs); } return NULL; } @@ -187,10 +202,13 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) return NULL; hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { - if (vxlan->default_dst.remote_vni == id) + if (vxlan->default_dst.remote_vni == id) { + vxlan_sock_put(vs); return vxlan; + } } + vxlan_sock_put(vs); return NULL; } @@ -926,7 +944,7 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) return false; } -static void vxlan_sock_put(struct sk_buff *skb) +static void vxlan_sock_destructor(struct sk_buff *skb) { sock_put(skb->sk); } @@ -940,7 +958,7 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); sock_hold(sk); skb->sk = sk; - skb->destructor = vxlan_sock_put; + skb->destructor = vxlan_sock_destructor; } /* Compute source port for outgoing packet @@ -1255,10 +1273,14 @@ static int vxlan_open(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; + vxlan_sock_get(vxlan->vn_sock); + if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) { err = vxlan_join_group(dev); - if (err) + if (err) { + vxlan_sock_put(vxlan->vn_sock); return err; + } } if (vxlan->age_interval) @@ -1294,6 +1316,8 @@ static int vxlan_stop(struct net_device *dev) del_timer_sync(&vxlan->age_timer); + vxlan_sock_put(vxlan->vn_sock); + vxlan_flush(vxlan); return 0; @@ -1447,7 +1471,7 @@ static void vxlan_del_work(struct work_struct *work) struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work); sk_release_kernel(vs->sock->sk); - kfree_rcu(vs, rcu); + kfree(vs); } /* Create new listen socket if needed */ @@ -1503,7 +1527,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port) udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; udp_encap_enable(); - vs->refcnt = 1; + atomic_set(&vs->refcnt, 1); return vs; } @@ -1592,9 +1616,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, } vs = vxlan_find_port(net, vxlan->dst_port); - if (vs) - ++vs->refcnt; - else { + if (!vs) { /* Drop lock because socket create acquires RTNL lock */ rtnl_unlock(); vs = vxlan_socket_create(net, vxlan->dst_port); @@ -1610,12 +1632,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, err = register_netdevice(dev); if (err) { - if (--vs->refcnt == 0) { - rtnl_unlock(); - sk_release_kernel(vs->sock->sk); - kfree(vs); - rtnl_lock(); - } + vxlan_sock_put(vs); return err; } @@ -1634,10 +1651,7 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head) list_del(&vxlan->next); unregister_netdevice_queue(dev, head); - if (--vs->refcnt == 0) { - hlist_del_rcu(&vs->hlist); - schedule_work(&vs->del_work); - } + vxlan_sock_put(vs); } static size_t vxlan_get_size(const struct net_device *dev)