diff mbox

Multicast routing stops functioning after 4G multicast packets received.

Message ID 20140110063638.GA17866@order.stressinduktion.org
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Hannes Frederic Sowa Jan. 10, 2014, 6:36 a.m. UTC
On Thu, Jan 09, 2014 at 09:14:11PM +0100, Bob Falken wrote:
> Hello,
> Testing this patch as I'm typing this. Will check status in about 12 hours.
> Unfortunately, I don't have any receivers available for requesting the multicast stream on the edge point anymore. 
> So there is no TX traffic at the moment.
> 
> I will have a better test-lab available next week. (hopefully).

Ok, so I am proposing this patch. Only difference from the RFC is that
I removed the superfluous arg.rule NULL-pointer checks (I hate if they
are superfluous and they always seem to spread ;) ).

Maybe you could test this one instead and David could pick it up as soon
as your results are in.

I'll also look for the stable kernels where FIB_LOOKUP_NOREF is not
yet available.

Thank you,

  Hannes

[PATCH net] net: avoid reference counter overflows on fib_rules in multicast forwarding

When introducing multiple table support for multicast forwarding in
IPv4 and IPv6, necessary fib_rules_put reference count decrements were
forgotten.

Bob Falken reported that after 4G packets, multicast forwarding stopped
working. This was because of a rule reference counter overflow which
freed the rule as soon as the overflow happened.

So, use FIB_LOOKUP_NOREF if we are already in an RCU-protected section and
correctly deal with the reference counter if not (called from ndo_start_xmit).

Fixes: f0ad0860d01e47 ("ipv4: ipmr: support multiple tables")
Fixes: d1db275dd3f6e4 ("ipv6: ip6mr: support multiple tables")
Reported-by: Bob Falken <NetFestivalHaveFun@gmx.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Julian Anastasov <ja@ssi.bg>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
 net/ipv4/ipmr.c  | 23 +++++++++++++++++------
 net/ipv6/ip6mr.c | 21 +++++++++++++++------
 2 files changed, 32 insertions(+), 12 deletions(-)

Comments

Eric Dumazet Jan. 10, 2014, 7:01 a.m. UTC | #1
On Fri, 2014-01-10 at 07:36 +0100, Hannes Frederic Sowa wrote:

> Ok, so I am proposing this patch. Only difference from the RFC is that
> I removed the superfluous arg.rule NULL-pointer checks (I hate if they
> are superfluous and they always seem to spread ;) ).
> 
> Maybe you could test this one instead and David could pick it up as soon
> as your results are in.
> 
> I'll also look for the stable kernels where FIB_LOOKUP_NOREF is not
> yet available.
> 
> Thank you,
> 
>   Hannes
> 
> [PATCH net] net: avoid reference counter overflows on fib_rules in multicast forwarding
> 
> When introducing multiple table support for multicast forwarding in
> IPv4 and IPv6, necessary fib_rules_put reference count decrements were
> forgotten.
> 
> Bob Falken reported that after 4G packets, multicast forwarding stopped
> working. This was because of a rule reference counter overflow which
> freed the rule as soon as the overflow happened.
> 
> So, use FIB_LOOKUP_NOREF if we are already in a RCU protected section and
> correctly deal with reference counter if not (called from ndo_start_xmit).
> 
> Fixes: f0ad0860d01e47 ("ipv4: ipmr: support multiple tables")
> Fixes: d1db275dd3f6e4 ("ipv6: ip6mr: support multiple tables")
> Reported-by: Bob Falken <NetFestivalHaveFun@gmx.com>
> Cc: Patrick McHardy <kaber@trash.net>
> Cc: Thomas Graf <tgraf@suug.ch>
> Cc: Julian Anastasov <ja@ssi.bg>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
> ---
>  net/ipv4/ipmr.c  | 23 +++++++++++++++++------
>  net/ipv6/ip6mr.c | 21 +++++++++++++++------
>  2 files changed, 32 insertions(+), 12 deletions(-)
> 
> diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
> index 421a249..c8d0857 100644
> --- a/net/ipv4/ipmr.c
> +++ b/net/ipv4/ipmr.c
> @@ -157,9 +157,12 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
>  static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
>  			   struct mr_table **mrt)
>  {
> -	struct ipmr_result res;
> -	struct fib_lookup_arg arg = { .result = &res, };
>  	int err;
> +	struct ipmr_result res;
> +	struct fib_lookup_arg arg = {
> +		.result = &res,
> +		.flags = FIB_LOOKUP_NOREF,
> +	};
>  
>  	err = fib_rules_lookup(net->ipv4.mr_rules_ops,
>  			       flowi4_to_flowi(flp4), 0, &arg);
> @@ -448,16 +451,22 @@ failure:
>  
>  static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
> +	int err;
> +	struct ipmr_result res;
>  	struct net *net = dev_net(dev);
> -	struct mr_table *mrt;
> +
> +	struct fib_lookup_arg arg = {
> +		.result = &res,
> +	};
> +
>  	struct flowi4 fl4 = {
>  		.flowi4_oif	= dev->ifindex,
>  		.flowi4_iif	= skb->skb_iif,
>  		.flowi4_mark	= skb->mark,
>  	};
> -	int err;
>  
> -	err = ipmr_fib_lookup(net, &fl4, &mrt);
> +	err = fib_rules_lookup(net->ipv4.mr_rules_ops,
> +			       flowi4_to_flowi(&fl4), 0, &arg);

It's not clear to me why you expand ipmr_fib_lookup().

Is there something wrong with the existing code?

It's not mentioned in the changelog.

>  	if (err < 0) {
>  		kfree_skb(skb);
>  		return err;
> @@ -466,9 +475,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
>  	read_lock(&mrt_lock);
>  	dev->stats.tx_bytes += skb->len;
>  	dev->stats.tx_packets++;
> -	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
> +	ipmr_cache_report(res.mrt, skb, res.mrt->mroute_reg_vif_num,
> +			  IGMPMSG_WHOLEPKT);
>  	read_unlock(&mrt_lock);
>  	kfree_skb(skb);
> +	fib_rule_put(arg.rule);

This is the one line that is really missing, patch could be smaller.

>  	return NETDEV_TX_OK;
>  }
>  
> diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
> index f365310..38347a3 100644
> --- a/net/ipv6/ip6mr.c
> +++ b/net/ipv6/ip6mr.c
> @@ -141,9 +141,12 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
>  static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
>  			    struct mr6_table **mrt)
>  {
> -	struct ip6mr_result res;
> -	struct fib_lookup_arg arg = { .result = &res, };
>  	int err;
> +	struct ip6mr_result res;
> +	struct fib_lookup_arg arg = {
> +		.result = &res,
> +		.flags = FIB_LOOKUP_NOREF,
> +	};
>  
>  	err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
>  			       flowi6_to_flowi(flp6), 0, &arg);
> @@ -693,16 +696,20 @@ static const struct inet6_protocol pim6_protocol = {
>  static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
>  				      struct net_device *dev)
>  {
> +	int err;
> +	struct ip6mr_result res;
>  	struct net *net = dev_net(dev);
> -	struct mr6_table *mrt;
>  	struct flowi6 fl6 = {
>  		.flowi6_oif	= dev->ifindex,
>  		.flowi6_iif	= skb->skb_iif,
>  		.flowi6_mark	= skb->mark,
>  	};
> -	int err;
> +	struct fib_lookup_arg arg = {
> +		.result = &res,
> +	};
>  
> -	err = ip6mr_fib_lookup(net, &fl6, &mrt);
> +	err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
> +			flowi6_to_flowi(&fl6), 0, &arg);


same remark here.

>  	if (err < 0) {
>  		kfree_skb(skb);
>  		return err;
> @@ -711,9 +718,11 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
>  	read_lock(&mrt_lock);
>  	dev->stats.tx_bytes += skb->len;
>  	dev->stats.tx_packets++;
> -	ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
> +	ip6mr_cache_report(res.mrt, skb, res.mrt->mroute_reg_vif_num,
> +			   MRT6MSG_WHOLEPKT);
>  	read_unlock(&mrt_lock);
>  	kfree_skb(skb);
> +	fib_rule_put(arg.rule);
>  	return NETDEV_TX_OK;
>  }
>  


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa Jan. 10, 2014, 7:10 a.m. UTC | #2
On Thu, Jan 09, 2014 at 11:01:46PM -0800, Eric Dumazet wrote:
> Its not clear to me why you expand ipmr_fib_lookup()
> 
> Is there something wrong with existing code ?

There are three users of ipmr_fib_lookup, two of them are in rcu_read_lock
section, one is not.

ipmr_fib_lookup does not pass down arg.rule reference, so I don't have a
chance to call fib_rule_put(arg.rule) on it. Thus I left ipmr_fib_lookup,
just adding FIB_LOOKUP_NOREF and expanding ipmr_fib_lookup into the
other function so I still have access to arg.rule to decrement the
reference counter.

Do you agree?

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Jan. 10, 2014, 7:32 a.m. UTC | #3
On Fri, 2014-01-10 at 08:10 +0100, Hannes Frederic Sowa wrote:
> On Thu, Jan 09, 2014 at 11:01:46PM -0800, Eric Dumazet wrote:
> > Its not clear to me why you expand ipmr_fib_lookup()
> > 
> > Is there something wrong with existing code ?
> 
> There are three users of ipmr_fib_lookup, two of them are in rcu_read_lock
> section, one is not.
> 
> ipmr_fib_lookup does not pass down arg.rule reference, so I don't have a
> chance to call fib_rule_put(arg.rule) on it. Thus I left ipmr_fib_lookup,
> just adding FIB_LOOKUP_NOREF and expanding ipmr_fib_lookup into the
> other function so I still have access to arg.rule to decrement the
> reference counter.
> 
> Do you agree?

Hmm, I see the problem now.

What about adding a parameter to ipmr_fib_lookup(),
to keep its spirit ?

ipmr_fib_lookup(net, &fl4, &mrt);
->
ipmr_fib_lookup(net, &fl4, &mrt, &rule);

Since ipmr_rt_fib_lookup() has the same rule leak, no ?

It's a bit late here, so maybe the following is just stupid:
Can't we do the fib_rule_put() inside ipmr_fib_lookup()?



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa Jan. 10, 2014, 7:43 a.m. UTC | #4
On Thu, Jan 09, 2014 at 11:32:59PM -0800, Eric Dumazet wrote:
> On Fri, 2014-01-10 at 08:10 +0100, Hannes Frederic Sowa wrote:
> > On Thu, Jan 09, 2014 at 11:01:46PM -0800, Eric Dumazet wrote:
> > > Its not clear to me why you expand ipmr_fib_lookup()
> > > 
> > > Is there something wrong with existing code ?
> > 
> > There are three users of ipmr_fib_lookup, two of them are in rcu_read_lock
> > section, one is not.
> > 
> > ipmr_fib_lookup does not pass down arg.rule reference, so I don't have a
> > chance to call fib_rule_put(arg.rule) on it. Thus I left ipmr_fib_lookup,
> > just adding FIB_LOOKUP_NOREF and expanding ipmr_fib_lookup into the
> > other function so I still have access to arg.rule to decrement the
> > reference counter.
> > 
> > Do you agree?
> 
> Hmm, I see the problem now.
> 
> What about adding a parameter to ipmr_fib_lookup(),
> to keep its spirit ?
> 
> ipmr_fib_lookup(net, &fl4, &mrt);
> ->
> ipmr_fib_lookup(net, &fl4, &mrt, &rule);
> 
> Since ipmr_rt_fib_lookup() has the same rule leak, no ?

No, ipmr_rt_fib_lookup is fine. This function gets called only from
rcu read locked section and we don't take table reference because of
FIB_LOOKUP_NOREF, so we don't need to put reference counter on arg.table.

We could add the additional argument, just ignoring it in ipmr_rt_fib_lookup.

> 
> Its a bit late here, so maybe following is just stupid :
> Cant we do the fib_rule_put() inside ipmr_fib_lookup() ?

We could add bool noref to ipmr_fib_lookup indicating we want to drop
reference to rule just after lookup.

I'll check if freeing a rule has additional side-effects on dependencies
in reg_vif_xmit. That would be a nice solution actually, thanks!

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa Jan. 10, 2014, 7:50 a.m. UTC | #5
On Fri, Jan 10, 2014 at 08:43:25AM +0100, Hannes Frederic Sowa wrote:
> On Thu, Jan 09, 2014 at 11:32:59PM -0800, Eric Dumazet wrote:
> > On Fri, 2014-01-10 at 08:10 +0100, Hannes Frederic Sowa wrote:
> > > On Thu, Jan 09, 2014 at 11:01:46PM -0800, Eric Dumazet wrote:
> > > > Its not clear to me why you expand ipmr_fib_lookup()
> > > > 
> > > > Is there something wrong with existing code ?
> > > 
> > > There are three users of ipmr_fib_lookup, two of them are in rcu_read_lock
> > > section, one is not.
> > > 
> > > ipmr_fib_lookup does not pass down arg.rule reference, so I don't have a
> > > chance to call fib_rule_put(arg.rule) on it. Thus I left ipmr_fib_lookup,
> > > just adding FIB_LOOKUP_NOREF and expanding ipmr_fib_lookup into the
> > > other function so I still have access to arg.rule to decrement the
> > > reference counter.
> > > 
> > > Do you agree?
> > 
> > Hmm, I see the problem now.
> > 
> > What about adding a parameter to ipmr_fib_lookup(),
> > to keep its spirit ?
> > 
> > ipmr_fib_lookup(net, &fl4, &mrt);
> > ->
> > ipmr_fib_lookup(net, &fl4, &mrt, &rule);
> > 
> > Since ipmr_rt_fib_lookup() has the same rule leak, no ?
> 
> No, ipmr_rt_fib_lookup is fine. This function gets called only from
> rcu read locked section and we don't take table reference because of
> FIB_LOOKUP_NOREF, so we don't need to put reference counter on arg.table.

arg.rule not table, actually.

> We could add the additional argument, just ignoring it in ipmr_rt_fib_lookup.
> 
> > 
> > Its a bit late here, so maybe following is just stupid :
> > Cant we do the fib_rule_put() inside ipmr_fib_lookup() ?
> 
> We could add bool noref to ipmr_fib_lookup indicating we want to drop
> reference to rule just after lookup.
> 
> I'll check if freeing a rule has additional side-effects on dependencies
> in reg_vif_xmit. That would be a nice solution actually, thanks!

Hmm, rule holds a reference to the net namespace in use. I don't know
if we want to add this special case. I guess net-namespace reference
cannot be removed while processing ndo_start_xmit callback but I don't
like this special case somehow. But I guess it is possible.

Your opinion on that?

Thanks,

  Hannes
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa Jan. 12, 2014, 7:42 a.m. UTC | #6
On Fri, Jan 10, 2014 at 08:50:05AM +0100, Hannes Frederic Sowa wrote:
> > > Its a bit late here, so maybe following is just stupid :
> > > Cant we do the fib_rule_put() inside ipmr_fib_lookup() ?
> > 
> > We could add bool noref to ipmr_fib_lookup indicating we want to drop
> > reference to rule just after lookup.
> > 
> > I'll check if freeing a rule has additional side-effects on dependencies
> > in reg_vif_xmit. That would be a nice solution actually, thanks!
> 
> Hmm, rule holds a reference to the net namespace in use. I don't know
> if we want to add this special case. I guess net-namespace reference
> cannot be removed while processing ndo_start_xmit callback but I don't
> like this special case somehow. But I guess it is possible.
> 
> Your opinion on that?

Hm, Eric. If we do that we can just specify FIB_LOOKUP_NOREF
unconditionally. FIB_LOOKUP_NOREF has no effect on an ipmr lookup other
than not taking the reference on the rule, which we would drop right
after the lookup anyway.

So we would actually be going back to the first patch in this thread. I
guess it is just a matter of style?

Greetings,

  Hannes

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Jan. 13, 2014, 12:56 a.m. UTC | #7
On Sun, 2014-01-12 at 08:42 +0100, Hannes Frederic Sowa wrote:

> Hm, Eric. If we do that we can just specifiy FIB_LOOKUP_NOREF
> unconditionally. FIB_LOOKUP_NOREF has no other side effects on a ipmr
> lookup as taking the reference on the rule, which we would drop after
> that.
> 
> So we would actually be going back to the first patch in this thread. I
> guess it is just a matter of style?

Hi Hannes, please submit a formal patch, so that we can have a proper
ground for discussion (I guess I'll only add my Acked-by)

Thanks !


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 421a249..c8d0857 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -157,9 +157,12 @@  static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
 			   struct mr_table **mrt)
 {
-	struct ipmr_result res;
-	struct fib_lookup_arg arg = { .result = &res, };
 	int err;
+	struct ipmr_result res;
+	struct fib_lookup_arg arg = {
+		.result = &res,
+		.flags = FIB_LOOKUP_NOREF,
+	};
 
 	err = fib_rules_lookup(net->ipv4.mr_rules_ops,
 			       flowi4_to_flowi(flp4), 0, &arg);
@@ -448,16 +451,22 @@  failure:
 
 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	int err;
+	struct ipmr_result res;
 	struct net *net = dev_net(dev);
-	struct mr_table *mrt;
+
+	struct fib_lookup_arg arg = {
+		.result = &res,
+	};
+
 	struct flowi4 fl4 = {
 		.flowi4_oif	= dev->ifindex,
 		.flowi4_iif	= skb->skb_iif,
 		.flowi4_mark	= skb->mark,
 	};
-	int err;
 
-	err = ipmr_fib_lookup(net, &fl4, &mrt);
+	err = fib_rules_lookup(net->ipv4.mr_rules_ops,
+			       flowi4_to_flowi(&fl4), 0, &arg);
 	if (err < 0) {
 		kfree_skb(skb);
 		return err;
@@ -466,9 +475,11 @@  static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
 	read_lock(&mrt_lock);
 	dev->stats.tx_bytes += skb->len;
 	dev->stats.tx_packets++;
-	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
+	ipmr_cache_report(res.mrt, skb, res.mrt->mroute_reg_vif_num,
+			  IGMPMSG_WHOLEPKT);
 	read_unlock(&mrt_lock);
 	kfree_skb(skb);
+	fib_rule_put(arg.rule);
 	return NETDEV_TX_OK;
 }
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index f365310..38347a3 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -141,9 +141,12 @@  static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
 static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
 			    struct mr6_table **mrt)
 {
-	struct ip6mr_result res;
-	struct fib_lookup_arg arg = { .result = &res, };
 	int err;
+	struct ip6mr_result res;
+	struct fib_lookup_arg arg = {
+		.result = &res,
+		.flags = FIB_LOOKUP_NOREF,
+	};
 
 	err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
 			       flowi6_to_flowi(flp6), 0, &arg);
@@ -693,16 +696,20 @@  static const struct inet6_protocol pim6_protocol = {
 static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
+	int err;
+	struct ip6mr_result res;
 	struct net *net = dev_net(dev);
-	struct mr6_table *mrt;
 	struct flowi6 fl6 = {
 		.flowi6_oif	= dev->ifindex,
 		.flowi6_iif	= skb->skb_iif,
 		.flowi6_mark	= skb->mark,
 	};
-	int err;
+	struct fib_lookup_arg arg = {
+		.result = &res,
+	};
 
-	err = ip6mr_fib_lookup(net, &fl6, &mrt);
+	err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
+			flowi6_to_flowi(&fl6), 0, &arg);
 	if (err < 0) {
 		kfree_skb(skb);
 		return err;
@@ -711,9 +718,11 @@  static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
 	read_lock(&mrt_lock);
 	dev->stats.tx_bytes += skb->len;
 	dev->stats.tx_packets++;
-	ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
+	ip6mr_cache_report(res.mrt, skb, res.mrt->mroute_reg_vif_num,
+			   MRT6MSG_WHOLEPKT);
 	read_unlock(&mrt_lock);
 	kfree_skb(skb);
+	fib_rule_put(arg.rule);
 	return NETDEV_TX_OK;
 }