Message ID | 1345372308.5158.54.camel@edumazet-glaptop |
---|---|
State | Accepted, archived |
Delegated to: | David Miller |
Headers | show |
Hi Eric Please correct me if I'm wrong about below comments. On 2012年08月19日 18:31, Eric Dumazet wrote: > From: Eric Dumazet<edumazet@google.com> > > This patch reverts commit 56892261ed1a (xfrm: Use rcu_dereference_bh to > deference pointer protected by rcu_read_lock_bh), and fixes bugs > introduced in commit 418a99ac6ad ( Replace rwlock on xfrm_policy_afinfo > with rcu ) > > 1) We properly use RCU variant in this file, not a mix of RCU/RCU_BH > > 2) We must defer some writes after the synchronize_rcu() call or a reader > can crash dereferencing NULL pointer. Not exactly. net/ipv4/xfrm4_policy.c static void __exit xfrm4_policy_fini(void) -> xfrm_policy_unregister_afinfo IMHO, ip stack can never be compiled as module, so is xfrm4_policy_fini freed up after system bootup? which means xfrm4_policy_fini can never be called. so an dereferencing NULL pointer by a reader could not happen. > > 3) Now we use the xfrm_policy_afinfo_lock spinlock only from process > context, we no longer need to block BH in xfrm_policy_register_afinfo() > and xfrm_policy_unregister_afinfo() > I don't think it's related to what kinds of locks we are using. we call xfrm_policy_register_afinfo in process context, but actually what xfrm_policy_afinfo_lock protected can be used in soft irq context. that's why xx_bh is used in: e959d812 " [XFRM]: fix incorrect xfrm_policy_afinfo_lock use" Is such scenario still valid? > 4) Can use RCU_INIT_POINTER() instead of rcu_assign_pointer() in > xfrm_policy_unregister_afinfo() > > 5) Remove a forward inline declaration (xfrm_policy_put_afinfo()), > and also move xfrm_policy_get_afinfo() declaration. 
> > Signed-off-by: Eric Dumazet<edumazet@google.com> > Cc: Fan Du<fan.du@windriver.com> > Cc: Priyanka Jain<Priyanka.Jain@freescale.com> > --- > net/xfrm/xfrm_policy.c | 76 ++++++++++++++++++++------------------- > 1 file changed, 39 insertions(+), 37 deletions(-) > > diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c > index 6405764..e52f50f 100644 > --- a/net/xfrm/xfrm_policy.c > +++ b/net/xfrm/xfrm_policy.c > @@ -48,8 +48,6 @@ static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] > > static struct kmem_cache *xfrm_dst_cache __read_mostly; > > -static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); > -static inline void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); > static void xfrm_init_pmtu(struct dst_entry *dst); > static int stale_bundle(struct dst_entry *dst); > static int xfrm_bundle_ok(struct xfrm_dst *xdst); > @@ -96,6 +94,24 @@ bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl > return false; > } > > +static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) > +{ > + struct xfrm_policy_afinfo *afinfo; > + > + if (unlikely(family>= NPROTO)) > + return NULL; > + rcu_read_lock(); > + afinfo = rcu_dereference(xfrm_policy_afinfo[family]); > + if (unlikely(!afinfo)) > + rcu_read_unlock(); > + return afinfo; > +} > + > +static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) > +{ > + rcu_read_unlock(); > +} > + > static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, > const xfrm_address_t *saddr, > const xfrm_address_t *daddr, > @@ -2419,7 +2435,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) > return -EINVAL; > if (unlikely(afinfo->family>= NPROTO)) > return -EAFNOSUPPORT; > - spin_lock_bh(&xfrm_policy_afinfo_lock); > + spin_lock(&xfrm_policy_afinfo_lock); > if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) > err = -ENOBUFS; > else { > @@ -2442,7 +2458,7 @@ int 
xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) > afinfo->garbage_collect = xfrm_garbage_collect_deferred; > rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo); > } > - spin_unlock_bh(&xfrm_policy_afinfo_lock); > + spin_unlock(&xfrm_policy_afinfo_lock); > > rtnl_lock(); > for_each_net(net) { > @@ -2475,23 +2491,26 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) > return -EINVAL; > if (unlikely(afinfo->family>= NPROTO)) > return -EAFNOSUPPORT; > - spin_lock_bh(&xfrm_policy_afinfo_lock); > + spin_lock(&xfrm_policy_afinfo_lock); > if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { > if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) > err = -EINVAL; > - else { > - struct dst_ops *dst_ops = afinfo->dst_ops; > - rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], > - NULL); > - dst_ops->kmem_cachep = NULL; > - dst_ops->check = NULL; > - dst_ops->negative_advice = NULL; > - dst_ops->link_failure = NULL; > - afinfo->garbage_collect = NULL; > - } > + else > + RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family], > + NULL); > + } > + spin_unlock(&xfrm_policy_afinfo_lock); > + if (!err) { > + struct dst_ops *dst_ops = afinfo->dst_ops; > + > + synchronize_rcu(); > + > + dst_ops->kmem_cachep = NULL; > + dst_ops->check = NULL; > + dst_ops->negative_advice = NULL; > + dst_ops->link_failure = NULL; > + afinfo->garbage_collect = NULL; > } > - spin_unlock_bh(&xfrm_policy_afinfo_lock); > - synchronize_rcu(); > return err; > } > EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); > @@ -2500,32 +2519,15 @@ static void __net_init xfrm_dst_ops_init(struct net *net) > { > struct xfrm_policy_afinfo *afinfo; > > - rcu_read_lock_bh(); > - afinfo = rcu_dereference_bh(xfrm_policy_afinfo[AF_INET]); > + rcu_read_lock(); > + afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]); > if (afinfo) > net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops; > #if IS_ENABLED(CONFIG_IPV6) > - afinfo = rcu_dereference_bh(xfrm_policy_afinfo[AF_INET6]); > + 
afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]); > if (afinfo) > net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops; > #endif > - rcu_read_unlock_bh(); > -} > - > -static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) > -{ > - struct xfrm_policy_afinfo *afinfo; > - if (unlikely(family>= NPROTO)) > - return NULL; > - rcu_read_lock(); > - afinfo = rcu_dereference(xfrm_policy_afinfo[family]); > - if (unlikely(!afinfo)) > - rcu_read_unlock(); > - return afinfo; > -} > - > -static inline void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) > -{ > rcu_read_unlock(); > } > > > >
On Mon, 2012-08-20 at 12:40 +0800, Fan Du wrote:
> Hi Eric
>
> Please correct me if I'm wrong about below comments.
>
> On 2012年08月19日 18:31, Eric Dumazet wrote:
> > From: Eric Dumazet<edumazet@google.com>
> >
> > This patch reverts commit 56892261ed1a (xfrm: Use rcu_dereference_bh to
> > deference pointer protected by rcu_read_lock_bh), and fixes bugs
> > introduced in commit 418a99ac6ad ( Replace rwlock on xfrm_policy_afinfo
> > with rcu )
> >
> > 1) We properly use RCU variant in this file, not a mix of RCU/RCU_BH
> >
> > 2) We must defer some writes after the synchronize_rcu() call or a reader
> > can crash dereferencing NULL pointer.
>
> Not exactly.
>
> net/ipv4/xfrm4_policy.c
> static void __exit xfrm4_policy_fini(void)
> -> xfrm_policy_unregister_afinfo
>
> IMHO, ip stack can never be compiled as module, so is xfrm4_policy_fini
> freed up after system bootup? which means xfrm4_policy_fini can never be
> called.
>
> so an dereferencing NULL pointer by a reader could not happen.
>

Famous last words.

Anyway, xfrm_policy_unregister_afinfo() is also called from
xfrm6_policy_fini(), and IPv6 is a module. The day we can rmmod it,
we will uncover this bug.

RCU is complex (most people don't get it right, that's the truth),
and we should make it rock solid, or I can guarantee you
many patch attempts from future readers of this code.

You won't tell them:

"OK, but don't worry, we never call this function for real, why do you care
at all"

> >
> > 3) Now we use the xfrm_policy_afinfo_lock spinlock only from process
> > context, we no longer need to block BH in xfrm_policy_register_afinfo()
> > and xfrm_policy_unregister_afinfo()
> >
> I don't think it's related to what kinds of locks we are using.
> we call xfrm_policy_register_afinfo in process context, but actually
> what xfrm_policy_afinfo_lock protected can be used in soft irq context.
> that's why xx_bh is used in:

You did an RCU conversion and obviously have little idea of what
happened there.

This _bh stuff was needed because _before_ RCU, an rwlock was used.

And since read_lock() was used from a BH handler, _all_ write_lock() calls
had to use the write_lock_bh() variant to avoid a possible deadlock.

But after RCU, this is no longer needed, as an rcu_read_lock() cannot
block a writer anymore in the lock/unlock section.

In fact, xfrm_policy_afinfo_lock could be replaced by a mutex. So _bh()
is absolutely not needed anymore.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2012年08月20日 13:33, Eric Dumazet wrote:
> On Mon, 2012-08-20 at 12:40 +0800, Fan Du wrote:
>> Hi Eric
>>
>> Please correct me if I'm wrong about below comments.
>>
>> On 2012年08月19日 18:31, Eric Dumazet wrote:
>>> From: Eric Dumazet<edumazet@google.com>
>>>
>>> This patch reverts commit 56892261ed1a (xfrm: Use rcu_dereference_bh to
>>> deference pointer protected by rcu_read_lock_bh), and fixes bugs
>>> introduced in commit 418a99ac6ad ( Replace rwlock on xfrm_policy_afinfo
>>> with rcu )
>>>
>>> 1) We properly use RCU variant in this file, not a mix of RCU/RCU_BH
>>>
>>> 2) We must defer some writes after the synchronize_rcu() call or a reader
>>> can crash dereferencing NULL pointer.
>>
>> Not exactly.
>>
>> net/ipv4/xfrm4_policy.c
>> static void __exit xfrm4_policy_fini(void)
>> -> xfrm_policy_unregister_afinfo
>>
>> IMHO, ip stack can never be compiled as module, so is xfrm4_policy_fini
>> freed up after system bootup? which means xfrm4_policy_fini can never be
>> called.
>>
>> so an dereferencing NULL pointer by a reader could not happen.
>>
>
> Last famous words.
>
> Anyway xfrm_policy_unregister_afinfo() is also called from
> xfrm6_policy_fini(), and IPv6 is a module. The day we can rmmod it,
> we uncover this bug.
>
> RCU is complex (most people dont get it right, thats the truth),
> and we should make it rock solid, or I can guarantee you
> many patch attempts from future readers of this code.
>
> You wont tell them :
>
> "OK but dont worry we never call this function for real, why do you care
> at all"
>

You are correct!

One off-topic question:
The usage of xfrm_state_afinfo_lock/xfrm_km_lock is extremely
similar to that of xfrm_policy_afinfo_lock, except that the former are
read less frequently than the latter.

Is it justified to convert the RW locks xfrm_state_afinfo_lock/xfrm_km_lock
to RCU?

>>>
>>> 3) Now we use the xfrm_policy_afinfo_lock spinlock only from process
>>> context, we no longer need to block BH in xfrm_policy_register_afinfo()
>>> and xfrm_policy_unregister_afinfo()
>>>
>> I don't think it's related to what kinds of locks we are using.
>> we call xfrm_policy_register_afinfo in process context, but actually
>> what xfrm_policy_afinfo_lock protected can be used in soft irq context.
>> that's why xx_bh is used in:
>
> You did an RCU conversion and obviously have little idea of what
> happened there.
>
> This _bh stuff was needed because _before_ RCU, an rwlock was used.
>
> And since read_lock() was used from BH handler, _all_ write_lock() had
> to use the write_lock_bh() variant to avoid a possible deadlock.
>
> But after RCU, this no longer is needed, as an rcu_read_lock() cannot
> block a writer anymore in the lock/unlock section.
>
> In fact, xfrm_policy_afinfo_lock could be replaced by a mutex. So _bh()
> is absolutely not needed anymore.
>

I did indeed misunderstand the code a bit.
Your explanation is crystal clear, thanks :)

>
>
On Mon, 2012-08-20 at 14:33 +0800, Fan Du wrote:
> And one out of topic question:
> The usage of xfrm_state_afinfo_lock/xfrm_km_lock is extremely
> similar with xfrm_policy_afinfo_lock, except the former is not so
> frequently read than that of the later.
>
> Is it justified to convert RW xfrm_state_afinfo_lock/xfrm_km_lock into
> RCU?
>

I would say it is justified, and easy enough.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 19 Aug 2012 12:31:48 +0200

> From: Eric Dumazet <edumazet@google.com>
>
> This patch reverts commit 56892261ed1a (xfrm: Use rcu_dereference_bh to
> deference pointer protected by rcu_read_lock_bh), and fixes bugs
> introduced in commit 418a99ac6ad ( Replace rwlock on xfrm_policy_afinfo
> with rcu )
>
> 1) We properly use RCU variant in this file, not a mix of RCU/RCU_BH
>
> 2) We must defer some writes after the synchronize_rcu() call or a reader
> can crash dereferencing NULL pointer.
>
> 3) Now we use the xfrm_policy_afinfo_lock spinlock only from process
> context, we no longer need to block BH in xfrm_policy_register_afinfo()
> and xfrm_policy_unregister_afinfo()
>
> 4) Can use RCU_INIT_POINTER() instead of rcu_assign_pointer() in
> xfrm_policy_unregister_afinfo()
>
> 5) Remove a forward inline declaration (xfrm_policy_put_afinfo()),
> and also move xfrm_policy_get_afinfo() declaration.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied, thanks Eric.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6405764..e52f50f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -48,8 +48,6 @@ static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO] static struct kmem_cache *xfrm_dst_cache __read_mostly; -static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); -static inline void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); static void xfrm_init_pmtu(struct dst_entry *dst); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); @@ -96,6 +94,24 @@ bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl return false; } +static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo; + + if (unlikely(family >= NPROTO)) + return NULL; + rcu_read_lock(); + afinfo = rcu_dereference(xfrm_policy_afinfo[family]); + if (unlikely(!afinfo)) + rcu_read_unlock(); + return afinfo; +} + +static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + rcu_read_unlock(); +} + static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, const xfrm_address_t *saddr, const xfrm_address_t *daddr, @@ -2419,7 +2435,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) return -EINVAL; if (unlikely(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; - spin_lock_bh(&xfrm_policy_afinfo_lock); + spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) err = -ENOBUFS; else { @@ -2442,7 +2458,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) afinfo->garbage_collect = xfrm_garbage_collect_deferred; rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo); } - spin_unlock_bh(&xfrm_policy_afinfo_lock); + spin_unlock(&xfrm_policy_afinfo_lock); rtnl_lock(); for_each_net(net) { @@ -2475,23 +2491,26 @@ int xfrm_policy_unregister_afinfo(struct 
xfrm_policy_afinfo *afinfo) return -EINVAL; if (unlikely(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; - spin_lock_bh(&xfrm_policy_afinfo_lock); + spin_lock(&xfrm_policy_afinfo_lock); if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) err = -EINVAL; - else { - struct dst_ops *dst_ops = afinfo->dst_ops; - rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], - NULL); - dst_ops->kmem_cachep = NULL; - dst_ops->check = NULL; - dst_ops->negative_advice = NULL; - dst_ops->link_failure = NULL; - afinfo->garbage_collect = NULL; - } + else + RCU_INIT_POINTER(xfrm_policy_afinfo[afinfo->family], + NULL); + } + spin_unlock(&xfrm_policy_afinfo_lock); + if (!err) { + struct dst_ops *dst_ops = afinfo->dst_ops; + + synchronize_rcu(); + + dst_ops->kmem_cachep = NULL; + dst_ops->check = NULL; + dst_ops->negative_advice = NULL; + dst_ops->link_failure = NULL; + afinfo->garbage_collect = NULL; } - spin_unlock_bh(&xfrm_policy_afinfo_lock); - synchronize_rcu(); return err; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); @@ -2500,32 +2519,15 @@ static void __net_init xfrm_dst_ops_init(struct net *net) { struct xfrm_policy_afinfo *afinfo; - rcu_read_lock_bh(); - afinfo = rcu_dereference_bh(xfrm_policy_afinfo[AF_INET]); + rcu_read_lock(); + afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]); if (afinfo) net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops; #if IS_ENABLED(CONFIG_IPV6) - afinfo = rcu_dereference_bh(xfrm_policy_afinfo[AF_INET6]); + afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]); if (afinfo) net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops; #endif - rcu_read_unlock_bh(); -} - -static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) -{ - struct xfrm_policy_afinfo *afinfo; - if (unlikely(family >= NPROTO)) - return NULL; - rcu_read_lock(); - afinfo = rcu_dereference(xfrm_policy_afinfo[family]); - if (unlikely(!afinfo)) - rcu_read_unlock(); - return afinfo; -} - -static inline void 
xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) -{ rcu_read_unlock(); }