Message ID | 1450398593-14838-1-git-send-email-tim.gardner@canonical.com |
---|---|
State | New |
Headers | show |
Cherry pick and positive testing.
On Thu, Dec 17, 2015 at 05:29:53PM -0700, tim.gardner@canonical.com wrote: > From: Eric Dumazet <edumazet@google.com> > > BugLink: http://bugs.launchpad.net/bugs/1526946 > > David Wilder reported crashes caused by dst reuse. > > <quote David> > I am seeing a crash on a distro V4.2.3 kernel caused by a double > release of a dst_entry. In ipv4_dst_destroy() the call to > list_empty() finds a poisoned next pointer, indicating the dst_entry > has already been removed from the list and freed. The crash occurs > 18 to 24 hours into a run of a network stress exerciser. > </quote> > > Thanks to his detailed report and analysis, we were able to understand > the core issue. > > IP early demux can associate a dst to skb, after a lookup in TCP/UDP > sockets. > > When socket cache is not properly set, we want to store into > sk->sk_dst_cache the dst for future IP early demux lookups, > by acquiring a stable refcount on the dst. > > Problem is this acquisition is simply using an atomic_inc(), > which works well, unless the dst was queued for destruction from > dst_release() noticing dst refcount went to zero, if DST_NOCACHE > was set on dst. > > We need to make sure current refcount is not zero before incrementing > it, or risk double free as David reported. > > This patch, being a stable candidate, adds two new helpers, and use > them only from IP early demux problematic paths. > > It might be possible to merge in net-next skb_dst_force() and > skb_dst_force_safe(), but I prefer having the smallest patch for stable > kernels : Maybe some skb_dst_force() callers do not expect skb->dst > can suddenly be cleared. > > Can probably be backported back to linux-3.6 kernels > > Reported-by: David J. Wilder <dwilder@us.ibm.com> > Tested-by: David J. Wilder <dwilder@us.ibm.com> > Signed-off-by: Eric Dumazet <edumazet@google.com> > Signed-off-by: David S. Miller <davem@davemloft.net> > (cherry picked from linux-next commit 5037e9ef9454917b047f9f3a19b4dd179fbf7cd4) > Signed-off-by: Tim Gardner <tim.gardner@canonical.com> > --- > include/net/dst.h | 33 +++++++++++++++++++++++++++++++++ > include/net/sock.h | 2 +- > net/ipv4/tcp_ipv4.c | 5 ++--- > net/ipv6/tcp_ipv6.c | 3 +-- > 4 files changed, 37 insertions(+), 6 deletions(-) > > diff --git a/include/net/dst.h b/include/net/dst.h > index 2bc73f8a..c34b277 100644 > --- a/include/net/dst.h > +++ b/include/net/dst.h > @@ -306,6 +306,39 @@ static inline void skb_dst_force(struct sk_buff *skb) > } > } > > +/** > + * dst_hold_safe - Take a reference on a dst if possible > + * @dst: pointer to dst entry > + * > + * This helper returns false if it could not safely > + * take a reference on a dst. > + */ > +static inline bool dst_hold_safe(struct dst_entry *dst) > +{ > + if (dst->flags & DST_NOCACHE) > + return atomic_inc_not_zero(&dst->__refcnt); > + dst_hold(dst); > + return true; > +} > + > +/** > + * skb_dst_force_safe - makes sure skb dst is refcounted > + * @skb: buffer > + * > + * If dst is not yet refcounted and not destroyed, grab a ref on it. > + */ > +static inline void skb_dst_force_safe(struct sk_buff *skb) > +{ > + if (skb_dst_is_noref(skb)) { > + struct dst_entry *dst = skb_dst(skb); > + > + if (!dst_hold_safe(dst)) > + dst = NULL; > + > + skb->_skb_refdst = (unsigned long)dst; > + } > +} > + > > /** > * __skb_tunnel_rx - prepare skb for rx reinsert > diff --git a/include/net/sock.h b/include/net/sock.h > index 4ca4c3f..952e780 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -796,7 +796,7 @@ void sk_stream_write_space(struct sock *sk); > static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) > { > /* dont let skb dst not refcounted, we are going to leave rcu lock */ > - skb_dst_force(skb); > + skb_dst_force_safe(skb); > > if (!sk->sk_backlog.tail) > sk->sk_backlog.head = skb; > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > index 0ea2e1c..4d37075 100644 > --- a/net/ipv4/tcp_ipv4.c > +++ b/net/ipv4/tcp_ipv4.c > @@ -1508,7 +1508,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) > if (likely(sk->sk_rx_dst)) > skb_dst_drop(skb); > else > - skb_dst_force(skb); > + skb_dst_force_safe(skb); > > __skb_queue_tail(&tp->ucopy.prequeue, skb); > tp->ucopy.memory += skb->truesize; > @@ -1710,8 +1710,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) > { > struct dst_entry *dst = skb_dst(skb); > > - if (dst) { > - dst_hold(dst); > + if (dst && dst_hold_safe(dst)) { > sk->sk_rx_dst = dst; > inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; > } > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c > index 7a6cea5..403cb24 100644 > --- a/net/ipv6/tcp_ipv6.c > +++ b/net/ipv6/tcp_ipv6.c > @@ -93,10 +93,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) > { > struct dst_entry *dst = skb_dst(skb); > > - if (dst) { > + if (dst && dst_hold_safe(dst)) { > const struct rt6_info *rt = (const struct rt6_info *)dst; > > - dst_hold(dst); > sk->sk_rx_dst = dst; > inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; > inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); Looks to do what is claimed. Clean cherrypick. Acked-by: Andy Whitcroft <apw@canonical.com> -apw
On Thu, 2015-12-17 at 17:29 -0700, tim.gardner@canonical.com wrote: > From: Eric Dumazet <edumazet@google.com> > > BugLink: http://bugs.launchpad.net/bugs/1526946 > > David Wilder reported crashes caused by dst reuse. > > <quote David> > I am seeing a crash on a distro V4.2.3 kernel caused by a double > release of a dst_entry. In ipv4_dst_destroy() the call to > list_empty() finds a poisoned next pointer, indicating the dst_entry > has already been removed from the list and freed. The crash occurs > 18 to 24 hours into a run of a network stress exerciser. > </quote> > > Thanks to his detailed report and analysis, we were able to understand > the core issue. > > IP early demux can associate a dst to skb, after a lookup in TCP/UDP > sockets. > > When socket cache is not properly set, we want to store into > sk->sk_dst_cache the dst for future IP early demux lookups, > by acquiring a stable refcount on the dst. > > Problem is this acquisition is simply using an atomic_inc(), > which works well, unless the dst was queued for destruction from > dst_release() noticing dst refcount went to zero, if DST_NOCACHE > was set on dst. > > We need to make sure current refcount is not zero before incrementing > it, or risk double free as David reported. > > This patch, being a stable candidate, adds two new helpers, and use > them only from IP early demux problematic paths. > > It might be possible to merge in net-next skb_dst_force() and > skb_dst_force_safe(), but I prefer having the smallest patch for stable > kernels : Maybe some skb_dst_force() callers do not expect skb->dst > can suddenly be cleared. > > Can probably be backported back to linux-3.6 kernels > > Reported-by: David J. Wilder <dwilder@us.ibm.com> > Tested-by: David J. Wilder <dwilder@us.ibm.com> > Signed-off-by: Eric Dumazet <edumazet@google.com> > Signed-off-by: David S. Miller <davem@davemloft.net> > (cherry picked from linux-next commit 5037e9ef9454917b047f9f3a19b4dd179fbf7cd4) > Signed-off-by: Tim Gardner <tim.gardner@canonical.com> > --- > include/net/dst.h | 33 +++++++++++++++++++++++++++++++++ > include/net/sock.h | 2 +- > net/ipv4/tcp_ipv4.c | 5 ++--- > net/ipv6/tcp_ipv6.c | 3 +-- > 4 files changed, 37 insertions(+), 6 deletions(-) > > diff --git a/include/net/dst.h b/include/net/dst.h > index 2bc73f8a..c34b277 100644 > --- a/include/net/dst.h > +++ b/include/net/dst.h > @@ -306,6 +306,39 @@ static inline void skb_dst_force(struct sk_buff *skb) > } > } > > +/** > + * dst_hold_safe - Take a reference on a dst if possible > + * @dst: pointer to dst entry > + * > + * This helper returns false if it could not safely > + * take a reference on a dst. > + */ > +static inline bool dst_hold_safe(struct dst_entry *dst) > +{ > + if (dst->flags & DST_NOCACHE) > + return atomic_inc_not_zero(&dst->__refcnt); > + dst_hold(dst); > + return true; > +} > + > +/** > + * skb_dst_force_safe - makes sure skb dst is refcounted > + * @skb: buffer > + * > + * If dst is not yet refcounted and not destroyed, grab a ref on it. > + */ > +static inline void skb_dst_force_safe(struct sk_buff *skb) > +{ > + if (skb_dst_is_noref(skb)) { > + struct dst_entry *dst = skb_dst(skb); > + > + if (!dst_hold_safe(dst)) > + dst = NULL; > + > + skb->_skb_refdst = (unsigned long)dst; > + } > +} > + > > /** > * __skb_tunnel_rx - prepare skb for rx reinsert > diff --git a/include/net/sock.h b/include/net/sock.h > index 4ca4c3f..952e780 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -796,7 +796,7 @@ void sk_stream_write_space(struct sock *sk); > static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) > { > /* dont let skb dst not refcounted, we are going to leave rcu lock */ > - skb_dst_force(skb); > + skb_dst_force_safe(skb); > > if (!sk->sk_backlog.tail) > sk->sk_backlog.head = skb; > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > index 0ea2e1c..4d37075 100644 > --- a/net/ipv4/tcp_ipv4.c > +++ b/net/ipv4/tcp_ipv4.c > @@ -1508,7 +1508,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) > if (likely(sk->sk_rx_dst)) > skb_dst_drop(skb); > else > - skb_dst_force(skb); > + skb_dst_force_safe(skb); > > __skb_queue_tail(&tp->ucopy.prequeue, skb); > tp->ucopy.memory += skb->truesize; > @@ -1710,8 +1710,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) > { > struct dst_entry *dst = skb_dst(skb); > > - if (dst) { > - dst_hold(dst); > + if (dst && dst_hold_safe(dst)) { > sk->sk_rx_dst = dst; > inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; > } > diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c > index 7a6cea5..403cb24 100644 > --- a/net/ipv6/tcp_ipv6.c > +++ b/net/ipv6/tcp_ipv6.c > @@ -93,10 +93,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) > { > struct dst_entry *dst = skb_dst(skb); > > - if (dst) { > + if (dst && dst_hold_safe(dst)) { > const struct rt6_info *rt = (const struct rt6_info *)dst; > > - dst_hold(dst); > sk->sk_rx_dst = dst; > inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; > inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); > -- > 1.9.1 > >
diff --git a/include/net/dst.h b/include/net/dst.h index 2bc73f8a..c34b277 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -306,6 +306,39 @@ static inline void skb_dst_force(struct sk_buff *skb) } } +/** + * dst_hold_safe - Take a reference on a dst if possible + * @dst: pointer to dst entry + * + * This helper returns false if it could not safely + * take a reference on a dst. + */ +static inline bool dst_hold_safe(struct dst_entry *dst) +{ + if (dst->flags & DST_NOCACHE) + return atomic_inc_not_zero(&dst->__refcnt); + dst_hold(dst); + return true; +} + +/** + * skb_dst_force_safe - makes sure skb dst is refcounted + * @skb: buffer + * + * If dst is not yet refcounted and not destroyed, grab a ref on it. + */ +static inline void skb_dst_force_safe(struct sk_buff *skb) +{ + if (skb_dst_is_noref(skb)) { + struct dst_entry *dst = skb_dst(skb); + + if (!dst_hold_safe(dst)) + dst = NULL; + + skb->_skb_refdst = (unsigned long)dst; + } +} + /** * __skb_tunnel_rx - prepare skb for rx reinsert diff --git a/include/net/sock.h b/include/net/sock.h index 4ca4c3f..952e780 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -796,7 +796,7 @@ void sk_stream_write_space(struct sock *sk); static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ - skb_dst_force(skb); + skb_dst_force_safe(skb); if (!sk->sk_backlog.tail) sk->sk_backlog.head = skb; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0ea2e1c..4d37075 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1508,7 +1508,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) if (likely(sk->sk_rx_dst)) skb_dst_drop(skb); else - skb_dst_force(skb); + skb_dst_force_safe(skb); __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; @@ -1710,8 +1710,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { - dst_hold(dst); + if (dst && dst_hold_safe(dst)) { sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7a6cea5..403cb24 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -93,10 +93,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - if (dst) { + if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; - dst_hold(dst); sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);