Message ID | 20140110063638.GA17866@order.stressinduktion.org |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
On Fri, 2014-01-10 at 07:36 +0100, Hannes Frederic Sowa wrote: > Ok, so I am proposing this patch. Only difference from the RFC is that > I removed the superfluous arg.rule NULL-pointer checks (I hate if they > are superfluous and they always seem to spread ;) ). > > Maybe you could test this one instead and David could pick it up as soon > as your results are in. > > I'll also look for the stable kernels where FIB_LOOKUP_NOREF is not > yet available. > > Thank you, > > Hannes > > [PATCH net] net: avoid reference counter overflows on fib_rules in multicast forwarding > > When introducing multiple table support for multicast forwarding in > IPv4 and IPv6, necessary fib_rule_put reference count decrements were > forgotten. > > Bob Falken reported that after 4G packets, multicast forwarding stopped > working. This was because of a rule reference counter overflow which > freed the rule as soon as the overflow happened. > > So, use FIB_LOOKUP_NOREF if we are already in a RCU protected section and > correctly deal with reference counter if not (called from ndo_start_xmit). 
> > Fixes: f0ad0860d01e47 ("ipv4: ipmr: support multiple tables") > Fixes: d1db275dd3f6e4 ("ipv6: ip6mr: support multiple tables") > Reported-by: Bob Falken <NetFestivalHaveFun@gmx.com> > Cc: Patrick McHardy <kaber@trash.net> > Cc: Thomas Graf <tgraf@suug.ch> > Cc: Julian Anastasov <ja@ssi.bg> > Cc: Eric Dumazet <eric.dumazet@gmail.com> > Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> > --- > net/ipv4/ipmr.c | 23 +++++++++++++++++------ > net/ipv6/ip6mr.c | 21 +++++++++++++++------ > 2 files changed, 32 insertions(+), 12 deletions(-) > > diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c > index 421a249..c8d0857 100644 > --- a/net/ipv4/ipmr.c > +++ b/net/ipv4/ipmr.c > @@ -157,9 +157,12 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) > static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, > struct mr_table **mrt) > { > - struct ipmr_result res; > - struct fib_lookup_arg arg = { .result = &res, }; > int err; > + struct ipmr_result res; > + struct fib_lookup_arg arg = { > + .result = &res, > + .flags = FIB_LOOKUP_NOREF, > + }; > > err = fib_rules_lookup(net->ipv4.mr_rules_ops, > flowi4_to_flowi(flp4), 0, &arg); > @@ -448,16 +451,22 @@ failure: > > static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) > { > + int err; > + struct ipmr_result res; > struct net *net = dev_net(dev); > - struct mr_table *mrt; > + > + struct fib_lookup_arg arg = { > + .result = &res, > + }; > + > struct flowi4 fl4 = { > .flowi4_oif = dev->ifindex, > .flowi4_iif = skb->skb_iif, > .flowi4_mark = skb->mark, > }; > - int err; > > - err = ipmr_fib_lookup(net, &fl4, &mrt); > + err = fib_rules_lookup(net->ipv4.mr_rules_ops, > + flowi4_to_flowi(&fl4), 0, &arg); Its not clear to me why you expand ipmr_fib_lookup() Is there something wrong with existing code ? 
Its not mentioned in changelog > if (err < 0) { > kfree_skb(skb); > return err; > @@ -466,9 +475,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) > read_lock(&mrt_lock); > dev->stats.tx_bytes += skb->len; > dev->stats.tx_packets++; > - ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT); > + ipmr_cache_report(res.mrt, skb, res.mrt->mroute_reg_vif_num, > + IGMPMSG_WHOLEPKT); > read_unlock(&mrt_lock); > kfree_skb(skb); > + fib_rule_put(arg.rule); This is the one line that is really missing, patch could be smaller. > return NETDEV_TX_OK; > } > > diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c > index f365310..38347a3 100644 > --- a/net/ipv6/ip6mr.c > +++ b/net/ipv6/ip6mr.c > @@ -141,9 +141,12 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) > static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, > struct mr6_table **mrt) > { > - struct ip6mr_result res; > - struct fib_lookup_arg arg = { .result = &res, }; > int err; > + struct ip6mr_result res; > + struct fib_lookup_arg arg = { > + .result = &res, > + .flags = FIB_LOOKUP_NOREF, > + }; > > err = fib_rules_lookup(net->ipv6.mr6_rules_ops, > flowi6_to_flowi(flp6), 0, &arg); > @@ -693,16 +696,20 @@ static const struct inet6_protocol pim6_protocol = { > static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, > struct net_device *dev) > { > + int err; > + struct ip6mr_result res; > struct net *net = dev_net(dev); > - struct mr6_table *mrt; > struct flowi6 fl6 = { > .flowi6_oif = dev->ifindex, > .flowi6_iif = skb->skb_iif, > .flowi6_mark = skb->mark, > }; > - int err; > + struct fib_lookup_arg arg = { > + .result = &res, > + }; > > - err = ip6mr_fib_lookup(net, &fl6, &mrt); > + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, > + flowi6_to_flowi(&fl6), 0, &arg); same remark here. 
> if (err < 0) { > kfree_skb(skb); > return err; > @@ -711,9 +718,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, > read_lock(&mrt_lock); > dev->stats.tx_bytes += skb->len; > dev->stats.tx_packets++; > - ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT); > + ip6mr_cache_report(res.mrt, skb, res.mrt->mroute_reg_vif_num, > + MRT6MSG_WHOLEPKT); > read_unlock(&mrt_lock); > kfree_skb(skb); > + fib_rule_put(arg.rule); > return NETDEV_TX_OK; > } > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 09, 2014 at 11:01:46PM -0800, Eric Dumazet wrote: > Its not clear to me why you expand ipmr_fib_lookup() > > Is there something wrong with existing code ? There are three users of ipmr_fib_lookup, two of them are in rcu_read_lock section, one is not. ipmr_fib_lookup does not pass down arg.rule reference, so I don't have a chance to call fib_rule_put(arg.rule) on it. Thus I left ipmr_fib_lookup, just adding FIB_LOOKUP_NOREF and expanding ipmr_fib_lookup into the other function so I still have access to arg.rule to decrement the reference counter. Do you agree? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2014-01-10 at 08:10 +0100, Hannes Frederic Sowa wrote: > On Thu, Jan 09, 2014 at 11:01:46PM -0800, Eric Dumazet wrote: > > Its not clear to me why you expand ipmr_fib_lookup() > > > > Is there something wrong with existing code ? > > There are three users of ipmr_fib_lookup, two of them are in rcu_read_lock > section, one is not. > > ipmr_fib_lookup does not pass down arg.rule reference, so I don't have a > chance to call fib_rule_put(arg.rule) on it. Thus I left ipmr_fib_lookup, > just adding FIB_LOOKUP_NOREF and expanding ipmr_fib_lookup into the > other function so I still have access to arg.rule to decrement the > reference counter. > > Do you agree? Hmm, I see the problem now. What about adding a parameter to ipmr_fib_lookup(), to keep its spirit ? ipmr_fib_lookup(net, &fl4, &mrt); -> ipmr_fib_lookup(net, &fl4, &mrt, &rule); Since ipmr_rt_fib_lookup() has the same rule leak, no ? Its a bit late here, so maybe following is just stupid : Cant we do the fib_rule_put() inside ipmr_fib_lookup() ? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 09, 2014 at 11:32:59PM -0800, Eric Dumazet wrote: > On Fri, 2014-01-10 at 08:10 +0100, Hannes Frederic Sowa wrote: > > On Thu, Jan 09, 2014 at 11:01:46PM -0800, Eric Dumazet wrote: > > > Its not clear to me why you expand ipmr_fib_lookup() > > > > > > Is there something wrong with existing code ? > > > > There are three users of ipmr_fib_lookup, two of them are in rcu_read_lock > > section, one is not. > > > > ipmr_fib_lookup does not pass down arg.rule reference, so I don't have a > > chance to call fib_rule_put(arg.rule) on it. Thus I left ipmr_fib_lookup, > > just adding FIB_LOOKUP_NOREF and expanding ipmr_fib_lookup into the > > other function so I still have access to arg.rule to decrement the > > reference counter. > > > > Do you agree? > > Hmm, I see the problem now. > > What about adding a parameter to ipmr_fib_lookup(), > to keep its spirit ? > > ipmr_fib_lookup(net, &fl4, &mrt); > -> > ipmr_fib_lookup(net, &fl4, &mrt, &rule); > > Since ipmr_rt_fib_lookup() has the same rule leak, no ? No, ipmr_rt_fib_lookup is fine. This function gets called only from rcu read locked section and we don't take table reference because of FIB_LOOKUP_NOREF, so we don't need to put reference counter on arg.table. We could add the additional argument, just ignoring it in ipmr_rt_fib_lookup. > > Its a bit late here, so maybe following is just stupid : > Cant we do the fib_rule_put() inside ipmr_fib_lookup() ? We could add bool noref to ipmr_fib_lookup indicating we want to drop reference to rule just after lookup. I'll check if freeing a rule has additional side-effects on dependencies in reg_vif_xmit. That would be a nice solution actually, thanks! -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 10, 2014 at 08:43:25AM +0100, Hannes Frederic Sowa wrote: > On Thu, Jan 09, 2014 at 11:32:59PM -0800, Eric Dumazet wrote: > > On Fri, 2014-01-10 at 08:10 +0100, Hannes Frederic Sowa wrote: > > > On Thu, Jan 09, 2014 at 11:01:46PM -0800, Eric Dumazet wrote: > > > > Its not clear to me why you expand ipmr_fib_lookup() > > > > > > > > Is there something wrong with existing code ? > > > > > > There are three users of ipmr_fib_lookup, two of them are in rcu_read_lock > > > section, one is not. > > > > > > ipmr_fib_lookup does not pass down arg.rule reference, so I don't have a > > > chance to call fib_rule_put(arg.rule) on it. Thus I left ipmr_fib_lookup, > > > just adding FIB_LOOKUP_NOREF and expanding ipmr_fib_lookup into the > > > other function so I still have access to arg.rule to decrement the > > > reference counter. > > > > > > Do you agree? > > > > Hmm, I see the problem now. > > > > What about adding a parameter to ipmr_fib_lookup(), > > to keep its spirit ? > > > > ipmr_fib_lookup(net, &fl4, &mrt); > > -> > > ipmr_fib_lookup(net, &fl4, &mrt, &rule); > > > > Since ipmr_rt_fib_lookup() has the same rule leak, no ? > > No, ipmr_rt_fib_lookup is fine. This function gets called only from > rcu read locked section and we don't take table reference because of > FIB_LOOKUP_NOREF, so we don't need to put reference counter on arg.table. arg.rule not table, actually. > We could add the additional argument, just ignoring it in ipmr_rt_fib_lookup. > > > > > Its a bit late here, so maybe following is just stupid : > > Cant we do the fib_rule_put() inside ipmr_fib_lookup() ? > > We could add bool noref to ipmr_fib_lookup indicating we want to drop > reference to rule just after lookup. > > I'll check if freeing a rule has additional side-effects on dependencies > in reg_vif_xmit. That would be a nice solution actually, thanks! Hmm, rule holds a reference to the net namespace in use. I don't know if we want to add this special case. 
I guess net-namespace reference cannot be removed while processing ndo_start_xmit callback but I don't like this special case somehow. But I guess it is possible. Your opinion on that? Thanks, Hannes -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 10, 2014 at 08:50:05AM +0100, Hannes Frederic Sowa wrote: > > > Its a bit late here, so maybe following is just stupid : > > > Cant we do the fib_rule_put() inside ipmr_fib_lookup() ? > > > > We could add bool noref to ipmr_fib_lookup indicating we want to drop > > reference to rule just after lookup. > > > > I'll check if freeing a rule has additional side-effects on dependencies > > in reg_vif_xmit. That would be a nice solution actually, thanks! > > Hmm, rule holds a reference to the net namespace in use. I don't know > if we want to add this special case. I guess net-namespace reference > cannot be removed while processing ndo_start_xmit callback but I don't > like this special case somehow. But I guess it is possible. > > Your opinion on that? Hm, Eric. If we do that we can just specify FIB_LOOKUP_NOREF unconditionally. FIB_LOOKUP_NOREF has no effect on an ipmr lookup other than skipping the reference on the rule, which we would drop right after the lookup anyway. So we would actually be going back to the first patch in this thread. I guess it is just a matter of style? Greetings, Hannes -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2014-01-12 at 08:42 +0100, Hannes Frederic Sowa wrote: > Hm, Eric. If we do that we can just specify FIB_LOOKUP_NOREF > unconditionally. FIB_LOOKUP_NOREF has no other side effects on a ipmr > lookup as taking the reference on the rule, which we would drop after > that. > > So we would actually be going back to the first patch in this thread. I > guess it is just a matter of style? Hi Hannes, please submit a formal patch, so that we can have a proper ground for discussion (I guess I'll only add my Acked-by) Thanks! -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 421a249..c8d0857 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -157,9 +157,12 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { - struct ipmr_result res; - struct fib_lookup_arg arg = { .result = &res, }; int err; + struct ipmr_result res; + struct fib_lookup_arg arg = { + .result = &res, + .flags = FIB_LOOKUP_NOREF, + }; err = fib_rules_lookup(net->ipv4.mr_rules_ops, flowi4_to_flowi(flp4), 0, &arg); @@ -448,16 +451,22 @@ failure: static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { + int err; + struct ipmr_result res; struct net *net = dev_net(dev); - struct mr_table *mrt; + + struct fib_lookup_arg arg = { + .result = &res, + }; + struct flowi4 fl4 = { .flowi4_oif = dev->ifindex, .flowi4_iif = skb->skb_iif, .flowi4_mark = skb->mark, }; - int err; - err = ipmr_fib_lookup(net, &fl4, &mrt); + err = fib_rules_lookup(net->ipv4.mr_rules_ops, + flowi4_to_flowi(&fl4), 0, &arg); if (err < 0) { kfree_skb(skb); return err; @@ -466,9 +475,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT); + ipmr_cache_report(res.mrt, skb, res.mrt->mroute_reg_vif_num, + IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); + fib_rule_put(arg.rule); return NETDEV_TX_OK; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index f365310..38347a3 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -141,9 +141,12 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr6_table **mrt) { - struct ip6mr_result res; - struct fib_lookup_arg arg = { .result = &res, }; int err; + struct ip6mr_result res; + struct fib_lookup_arg arg = { + 
.result = &res, + .flags = FIB_LOOKUP_NOREF, + }; err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); @@ -693,16 +696,20 @@ static const struct inet6_protocol pim6_protocol = { static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { + int err; + struct ip6mr_result res; struct net *net = dev_net(dev); - struct mr6_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif, .flowi6_mark = skb->mark, }; - int err; + struct fib_lookup_arg arg = { + .result = &res, + }; - err = ip6mr_fib_lookup(net, &fl6, &mrt); + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, + flowi6_to_flowi(&fl6), 0, &arg); if (err < 0) { kfree_skb(skb); return err; @@ -711,9 +718,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; dev->stats.tx_packets++; - ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT); + ip6mr_cache_report(res.mrt, skb, res.mrt->mroute_reg_vif_num, + MRT6MSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); + fib_rule_put(arg.rule); return NETDEV_TX_OK; }