diff mbox

BUG ? ipip unregister_netdevice_many()

Message ID 20101012.130520.48517464.davem@davemloft.net
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

David Miller Oct. 12, 2010, 8:05 p.m. UTC
From: ebiederm@xmission.com (Eric W. Biederman)
Date: Fri, 08 Oct 2010 10:32:40 -0700

> It is just dealing with not flushing the entire routing cache, just the
> routes that have expired.  Which prevents one network namespace from
> flushing its routes and DOS'ing another.

That's a very indirect and obfuscated way of handling it.

And I still don't know why we let the first contiguous set of expired
entries in the chain get freed outside of the lock, and the rest
inside the lock.  That really isn't explained by anything I've read.

How about we just do exactly what's intended, and with no ifdefs?

Signed-off-by: David S. Miller <davem@davemloft.net>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Jarek Poplawski Oct. 13, 2010, 11:19 a.m. UTC | #1
On 2010-10-12 22:05, David Miller wrote:
> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Fri, 08 Oct 2010 10:32:40 -0700
> 
>> It is just dealing with not flushing the entire routing cache, just the
>> routes that have expired.  Which prevents one network namespace from
>> flushing its routes and DOS'ing another.
> 
> That's a very indirect and obfuscated way of handling it.
> 
> And I still don't know why we let the first contiguous set of expired
> entries in the chain get freed outside of the lock, and the rest
> inside the lock.  That really isn't explained by anything I've read.
> 
> How about we just do exactly what's intended, and with no ifdefs?
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
...
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 0755aa4..6ad730c 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -712,13 +712,14 @@ static inline int rt_is_expired(struct rtable *rth)
>   * Can be called by a softirq or a process.
>   * In the later case, we want to be reschedule if necessary
>   */
> -static void rt_do_flush(int process_context)
> +static void rt_do_flush(struct net *net, int process_context)
>  {
>  	unsigned int i;
>  	struct rtable *rth, *next;
> -	struct rtable * tail;
>  
>  	for (i = 0; i <= rt_hash_mask; i++) {
> +		struct rtable *list, **pprev;

Isn't "list = NULL" needed here?

Jarek P.

...
> +				rth->dst.rt_next = list;
> +				list = rth;
> +			} else
> +				pprev = &rth->dst.rt_next;
> +
> +			rth = next;
...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Oct. 13, 2010, 9:58 p.m. UTC | #2
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Wed, 13 Oct 2010 11:19:47 +0000

>> -static void rt_do_flush(int process_context)
>> +static void rt_do_flush(struct net *net, int process_context)
>>  {
>>  	unsigned int i;
>>  	struct rtable *rth, *next;
>> -	struct rtable * tail;
>>  
>>  	for (i = 0; i <= rt_hash_mask; i++) {
>> +		struct rtable *list, **pprev;
> 
> Isn't "list = NULL" needed here?

Yes it is, thanks for catching that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Lezcano Oct. 13, 2010, 10:16 p.m. UTC | #3
On 10/12/2010 10:05 PM, David Miller wrote:
> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Fri, 08 Oct 2010 10:32:40 -0700
>
>    
>> It is just dealing with not flushing the entire routing cache, just the
>> routes that have expired.  Which prevents one network namespace from
>> flushing its routes and DOS'ing another.
>>      
> That's a very indirect and obfuscated way of handling it.
>    

I agree.

> And I still don't know why we let the first contiguous set of expired
> entries in the chain get freed outside of the lock, and the rest
> inside the lock.  That really isn't explained by anything I've read.
>
> How about we just do exactly what's intended, and with no ifdefs?
>    
Acked-by: Daniel Lezcano <daniel.lezcano@free.fr>

Dave,

would you mind waiting until I have tested the patch before merging it?
I would like to stress this routine a bit with multiple containers.

Thanks
    -- Daniel

> Signed-off-by: David S. Miller<davem@davemloft.net>
>
> diff --git a/include/net/route.h b/include/net/route.h
> index 7e5e73b..8d24761 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -106,7 +106,7 @@ extern int		ip_rt_init(void);
>   extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
>   				       __be32 src, struct net_device *dev);
>   extern void		rt_cache_flush(struct net *net, int how);
> -extern void		rt_cache_flush_batch(void);
> +extern void		rt_cache_flush_batch(struct net *net);
>   extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
>   extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
>   extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 919f2ad..4039f56 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -999,7 +999,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
>   		rt_cache_flush(dev_net(dev), 0);
>   		break;
>   	case NETDEV_UNREGISTER_BATCH:
> -		rt_cache_flush_batch();
> +		rt_cache_flush_batch(dev_net(dev));
>   		break;
>   	}
>   	return NOTIFY_DONE;
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 0755aa4..6ad730c 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -712,13 +712,14 @@ static inline int rt_is_expired(struct rtable *rth)
>    * Can be called by a softirq or a process.
>    * In the later case, we want to be reschedule if necessary
>    */
> -static void rt_do_flush(int process_context)
> +static void rt_do_flush(struct net *net, int process_context)
>   {
>   	unsigned int i;
>   	struct rtable *rth, *next;
> -	struct rtable * tail;
>
>   	for (i = 0; i<= rt_hash_mask; i++) {
> +		struct rtable *list, **pprev;
> +
>   		if (process_context&&  need_resched())
>   			cond_resched();
>   		rth = rt_hash_table[i].chain;
> @@ -726,41 +727,27 @@ static void rt_do_flush(int process_context)
>   			continue;
>
>   		spin_lock_bh(rt_hash_lock_addr(i));
> -#ifdef CONFIG_NET_NS
> -		{
> -		struct rtable ** prev, * p;
>
> -		rth = rt_hash_table[i].chain;
> +		pprev =&rt_hash_table[i].chain;
> +		rth = *pprev;
> +		while (rth) {
> +			next = rth->dst.rt_next;
> +			if (dev_net(rth->dst.dev) == net) {
> +				*pprev = next;
>
> -		/* defer releasing the head of the list after spin_unlock */
> -		for (tail = rth; tail; tail = tail->dst.rt_next)
> -			if (!rt_is_expired(tail))
> -				break;
> -		if (rth != tail)
> -			rt_hash_table[i].chain = tail;
> -
> -		/* call rt_free on entries after the tail requiring flush */
> -		prev =&rt_hash_table[i].chain;
> -		for (p = *prev; p; p = next) {
> -			next = p->dst.rt_next;
> -			if (!rt_is_expired(p)) {
> -				prev =&p->dst.rt_next;
> -			} else {
> -				*prev = next;
> -				rt_free(p);
> -			}
> -		}
> +				rth->dst.rt_next = list;
> +				list = rth;
> +			} else
> +				pprev =&rth->dst.rt_next;
> +
> +			rth = next;
>   		}
> -#else
> -		rth = rt_hash_table[i].chain;
> -		rt_hash_table[i].chain = NULL;
> -		tail = NULL;
> -#endif
> +
>   		spin_unlock_bh(rt_hash_lock_addr(i));
>
> -		for (; rth != tail; rth = next) {
> -			next = rth->dst.rt_next;
> -			rt_free(rth);
> +		for (; list; list = next) {
> +			next = list->dst.rt_next;
> +			rt_free(list);
>   		}
>   	}
>   }
> @@ -906,13 +893,13 @@ void rt_cache_flush(struct net *net, int delay)
>   {
>   	rt_cache_invalidate(net);
>   	if (delay>= 0)
> -		rt_do_flush(!in_softirq());
> +		rt_do_flush(net, !in_softirq());
>   }
>
>   /* Flush previous cache invalidated entries from the cache */
> -void rt_cache_flush_batch(void)
> +void rt_cache_flush_batch(struct net *net)
>   {
> -	rt_do_flush(!in_softirq());
> +	rt_do_flush(net, !in_softirq());
>   }
>
>   static void rt_emergency_hash_rebuild(struct net *net)
>
>    

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Oct. 14, 2010, 4:40 a.m. UTC | #4
David Miller <davem@davemloft.net> writes:

> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Fri, 08 Oct 2010 10:32:40 -0700
>
>> It is just dealing with not flushing the entire routing cache, just the
>> routes that have expired.  Which prevents one network namespace from
>> flushing its routes and DOS'ing another.
>
> That's a very indirect and obfuscated way of handling it.
>
> And I still don't know why we let the first contiguous set of expired
> entries in the chain get freed outside of the lock, and the rest
> inside the lock.  That really isn't explained by anything I've read.
>
> How about we just do exactly what's intended, and with no ifdefs?

I'm all for no ifdefs.

And reading the code your version looks much simpler and easier
to read and I am all for that.

However I think the test should still be rt_is_expired(), because
that is what rt_do_flush() is doing removing the expired entries
from the list.

The only difference being that we remove the assumption that all hash
table entries must be expired at this point.

We have very straightforwardly expired all of the route table entries
for the namespace that got this going earlier.


Eric


> Signed-off-by: David S. Miller <davem@davemloft.net>
>
> diff --git a/include/net/route.h b/include/net/route.h
> index 7e5e73b..8d24761 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -106,7 +106,7 @@ extern int		ip_rt_init(void);
>  extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
>  				       __be32 src, struct net_device *dev);
>  extern void		rt_cache_flush(struct net *net, int how);
> -extern void		rt_cache_flush_batch(void);
> +extern void		rt_cache_flush_batch(struct net *net);
>  extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
>  extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
>  extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 919f2ad..4039f56 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -999,7 +999,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
>  		rt_cache_flush(dev_net(dev), 0);
>  		break;
>  	case NETDEV_UNREGISTER_BATCH:
> -		rt_cache_flush_batch();
> +		rt_cache_flush_batch(dev_net(dev));

I believe this change is actually wrong.  dev here is the first
element of a list of network devices, and that list may span multiple
network namespaces.

>  		break;
>  	}
>  	return NOTIFY_DONE;
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 0755aa4..6ad730c 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -712,13 +712,14 @@ static inline int rt_is_expired(struct rtable *rth)
>   * Can be called by a softirq or a process.
>   * In the later case, we want to be reschedule if necessary
>   */
> -static void rt_do_flush(int process_context)
> +static void rt_do_flush(struct net *net, int process_context)
>  {
>  	unsigned int i;
>  	struct rtable *rth, *next;
> -	struct rtable * tail;
>  
>  	for (i = 0; i <= rt_hash_mask; i++) {
> +		struct rtable *list, **pprev;
> +
>  		if (process_context && need_resched())
>  			cond_resched();
>  		rth = rt_hash_table[i].chain;
> @@ -726,41 +727,27 @@ static void rt_do_flush(int process_context)
>  			continue;
>  
>  		spin_lock_bh(rt_hash_lock_addr(i));
> -#ifdef CONFIG_NET_NS
> -		{
> -		struct rtable ** prev, * p;
>  
> -		rth = rt_hash_table[i].chain;
> +		pprev = &rt_hash_table[i].chain;
> +		rth = *pprev;
> +		while (rth) {
> +			next = rth->dst.rt_next;
> +			if (dev_net(rth->dst.dev) == net) {
> +				*pprev = next;
>  
> -		/* defer releasing the head of the list after spin_unlock */
> -		for (tail = rth; tail; tail = tail->dst.rt_next)
> -			if (!rt_is_expired(tail))
> -				break;
> -		if (rth != tail)
> -			rt_hash_table[i].chain = tail;
> -
> -		/* call rt_free on entries after the tail requiring flush */
> -		prev = &rt_hash_table[i].chain;
> -		for (p = *prev; p; p = next) {
> -			next = p->dst.rt_next;
> -			if (!rt_is_expired(p)) {
> -				prev = &p->dst.rt_next;
> -			} else {
> -				*prev = next;
> -				rt_free(p);
> -			}
> -		}
> +				rth->dst.rt_next = list;
> +				list = rth;
> +			} else
> +				pprev = &rth->dst.rt_next;
> +
> +			rth = next;
>  		}
> -#else
> -		rth = rt_hash_table[i].chain;
> -		rt_hash_table[i].chain = NULL;
> -		tail = NULL;
> -#endif
> +
>  		spin_unlock_bh(rt_hash_lock_addr(i));
>  
> -		for (; rth != tail; rth = next) {
> -			next = rth->dst.rt_next;
> -			rt_free(rth);
> +		for (; list; list = next) {
> +			next = list->dst.rt_next;
> +			rt_free(list);
>  		}
>  	}
>  }
> @@ -906,13 +893,13 @@ void rt_cache_flush(struct net *net, int delay)
>  {
>  	rt_cache_invalidate(net);
>  	if (delay >= 0)
> -		rt_do_flush(!in_softirq());
> +		rt_do_flush(net, !in_softirq());
>  }
>  
>  /* Flush previous cache invalidated entries from the cache */
> -void rt_cache_flush_batch(void)
> +void rt_cache_flush_batch(struct net *net)
>  {
> -	rt_do_flush(!in_softirq());
> +	rt_do_flush(net, !in_softirq());
>  }
>  
>  static void rt_emergency_hash_rebuild(struct net *net)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Oct. 14, 2010, 4:50 a.m. UTC | #5
From: ebiederm@xmission.com (Eric W. Biederman)
Date: Wed, 13 Oct 2010 21:40:49 -0700

> However I think the test should still be rt_is_expired(), because
> that is what rt_do_flush() is doing removing the expired entries
> from the list.

I can't see a reason for that test.

Everything calling into this code path has created a condition
that requires that all routing cache entries for that namespace
be deleted.

This function is meant to unconditionally flush the entire table.

I believe you added that extraneous test, and it never existed there
before.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Oct. 14, 2010, 5:20 a.m. UTC | #6
David Miller <davem@davemloft.net> writes:

> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Wed, 13 Oct 2010 21:40:49 -0700
>
>> However I think the test should still be rt_is_expired(), because
>> that is what rt_do_flush() is doing removing the expired entries
>> from the list.
>
> I can't see a reason for that test.
>
> Everything calling into this code path has created a condition
> that requires that all routing cache entries for that namespace
> be deleted.
>
> This function is meant to unconditionally flush the entire table.
>
> I believe you added that extraneous test, and it never existed there
> before.

At the point network namespaces entered the picture the logic was:

	void rt_cache_flush(struct net *net, int delay)
	{
		rt_cache_invalidate();
		if (delay >= 0)
			rt_do_flush(!in_softirq());
	}
	
	/* Strictly speaking rt_is_expired was just open coded in
	 * rt_check_expire. But this is the check that was used.
	 */
	static inline int rt_is_expired(struct rtable *rth)
	{
		return rth->rt_genid != atomic_read(&rt_genid);
	}
	
	static void rt_cache_invalidate(void)
	{
	        unsigned char shuffle;
	 
		get_random_bytes(&shuffle, sizeof(shuffle));
		atomic_add(shuffle + 1U, &rt_genid);
	}
	
	static void rt_do_flush(int process_context)
	{
	        unsigned int i;
	        struct rtable *rth, *next;
	
	        for (i = 0; i <= rt_hash_mask; i++) {
			if (process_context && need_resched())
				cond_resched();
			rth = rt_hash_table[i].chain;
			if (!rth)
				continue;
			
			spin_lock_bh(rt_hash_lock_addr(i));
			rth = rt_hash_table[i].chain;
			rt_hash_table[i].chain = NULL;
			tail = NULL;
			spin_unlock_bh(rt_hash_lock_addr(i));
			
			for(; rth != tail; rth = next)
			{
				next = rth->dst.rt_next;
				rt_free(rth);
			}
		}
	}

Because of the rt_cache_invalidate() in rt_cache_flush() this
guaranteed that rt_is_expired() was true for every route cache entry,
and this also guaranteed that every routing cache entry we were flushing
atomically became inaccessible.

So rt_is_expired() has always been valid, but in practice it was just
always optimized out as being redundant.

With the network namespace support we limit the scope of the test of
the invalidate to just a single network namespace, and as such
rt_is_expired stops being true for every cache entry.  So we cannot
unconditionally throw away entire chains.

All of which can be either done by network namespace equality or by
rt_is_expired().  Although Denis picked rt_is_expired() when he made
his change.

The only place it makes a noticeable difference in practice is what
happens when we do batched deletes of lots of network devices in
different network namespaces.

During batched network device deletes in fib_netdev_event we do
rt_cache_flush(dev_net(dev), -1) for each network device, and then a
final rt_cache_flush_batch() to remove the invalidated entries.  These
devices can be from multiple network namespaces, so I suspect that is
a savings worth having.

So if we are going to change the tests we need to do something with
rt_cache_flush_batch().  Further I do not see what is confusing about
a test that asks if the routing cache entry is unusable.  Is
rt_cache_expired() a bad name?

Eric

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hans Schillstrom Oct. 14, 2010, 6:41 a.m. UTC | #7
On Wednesday 13 October 2010 23:58:56 David Miller wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Wed, 13 Oct 2010 11:19:47 +0000
>
> >> -static void rt_do_flush(int process_context)
> >> +static void rt_do_flush(struct net *net, int process_context)
> >>  {
> >>  	unsigned int i;
> >>  	struct rtable *rth, *next;
> >> -	struct rtable * tail;
> >>
> >>  	for (i = 0; i <= rt_hash_mask; i++) {
> >> +		struct rtable *list, **pprev;
> >
> > Isn't "list = NULL" needed here?
>
> Yes it is, thanks for catching that.
>
It solves the crash, but...

 #
Slab corruption: size-4096 start=ffff88000f950000, len=4096
010: 00 00 00 00 00 00 00 00 6b 6b 6b 6b 6b 6b 6b 6b
unregister_netdevice: waiting for lo to become free. Usage count = 4
Slab corruption: size-4096 start=ffff88000f9af000, len=4096
010: 00 00 00 00 00 00 00 00 6b 6b 6b 6b 6b 6b 6b 6b
unregister_netdevice: waiting for lo to become free. Usage count = 4
unregister_netdevice: waiting for lo to become free. Usage count = 4
unregister_netdevice: waiting for lo to become free. Usage count = 4

Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Oct. 14, 2010, 3:09 p.m. UTC | #8
From: ebiederm@xmission.com (Eric W. Biederman)
Date: Wed, 13 Oct 2010 22:20:28 -0700

> With the network namespace support we limit the scope of the test of
> the invalidate to just a single network namespace, and as such
> rt_is_expired stops being true for every cache entry.  So we cannot
> unconditionally throw away entire chains.
> 
> All of which can be either done by network namespace equality or by
> rt_is_expired().  Although Denis picked rt_is_expired() when he made
> his change.

Right, and I chose to use namespace equality which will completely
compile into no code at all when namespace support is not in the
kernel.

Therefore, making the non-namespace case equivalent and as efficient
as it always was.

> The only place it makes a noticeable difference in practice is what
> happens when we do batched deletes of lots of network devices in
> different network namespaces.
> 
> During batched network device deletes in fib_netdev_event we do
> rt_cache_flush(dev_net(dev), -1) for each network device.  and then a
> final rt_cache_flush_batch() to remove the invalidated entries.  These
> devices can be from multiple network namespaces, so I suspect that is
> a savings worth having.

How can it make a real difference even in this case?  We'll obliterate
all the entries, and then on subsequent passes we'll find nothing
matching that namespace any more.

Show me performance tests that show it makes any difference, please.

> So if we are going to change the tests we need to do something with
> rt_cache_flush_batch().  Further I do not see what is confusing about
> a test that asks if the routing cache entry is unusable.  Is
> rt_cache_expired() a bad name?

It's not a bad name, it's just an unnecessary test that we don't need
to even make in this specific place.

Redundancy tends to accumulate.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Oct. 14, 2010, 6:35 p.m. UTC | #9
David Miller <davem@davemloft.net> writes:

> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Wed, 13 Oct 2010 22:20:28 -0700
>
>> With the network namespace support we limit the scope of the test of
>> the invalidate to just a single network namespace, and as such
>> rt_is_expired stops being true for every cache entry.  So we cannot
>> unconditionally throw away entire chains.
>> 
>> All of which can be either done by network namespace equality or by
>> rt_is_expired().  Although Denis picked rt_is_expired() when he made
>> his change.
>
> Right, and I choose to use namespace equality which will completely
> compile into no code at all when namespace support is not in the
> kernel.
>
> Therefore, making the non-namespace case equivalent and as efficient
> as it always was.

Almost; you still have the hash list inversion, which means you have
to look at the rtable entry even on a one-entry-long hash chain.
Perhaps I am looking at it wrong but once you look at the entries
I don't see the difference in the number of cache line faults
between one variant of the code and the other.

>> The only place it makes a noticeable difference in practice is what
>> happens when we do batched deletes of lots of network devices in
>> different network namespaces.
>> 
>> During batched network device deletes in fib_netdev_event we do
>> rt_cache_flush(dev_net(dev), -1) for each network device.  and then a
>> final rt_cache_flush_batch() to remove the invalidated entries.  These
>> devices can be from multiple network namespaces, so I suspect that is
>> a savings worth having.
>
> How can it make a real difference even in this case?  We'll obliterate
> all the entries, and then on subsequent passes we'll find nothing
> matching that namespace any more.
>
> Show me performance tests that show it makes any difference, please.

Octavian, did you happen to measure the performance difference when you
added batching of routing table flushes?

>> So if we are going to change the tests we need to do something with
>> rt_cache_flush_batch().  Further I do not see what is confusing about
>> a test that asks if the routing cache entry is unusable.  Is
>> rt_cache_expired() a bad name?
>
> It's not a bad name, it's just an unnecessary test that we don't need
> to even make in this specific place.

As long as we do something that is correct in the batched flush case
I am happy either way.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/route.h b/include/net/route.h
index 7e5e73b..8d24761 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -106,7 +106,7 @@  extern int		ip_rt_init(void);
 extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
 				       __be32 src, struct net_device *dev);
 extern void		rt_cache_flush(struct net *net, int how);
-extern void		rt_cache_flush_batch(void);
+extern void		rt_cache_flush_batch(struct net *net);
 extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 919f2ad..4039f56 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -999,7 +999,7 @@  static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		rt_cache_flush_batch();
+		rt_cache_flush_batch(dev_net(dev));
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0755aa4..6ad730c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -712,13 +712,14 @@  static inline int rt_is_expired(struct rtable *rth)
  * Can be called by a softirq or a process.
  * In the later case, we want to be reschedule if necessary
  */
-static void rt_do_flush(int process_context)
+static void rt_do_flush(struct net *net, int process_context)
 {
 	unsigned int i;
 	struct rtable *rth, *next;
-	struct rtable * tail;
 
 	for (i = 0; i <= rt_hash_mask; i++) {
+		struct rtable *list, **pprev;
+
 		if (process_context && need_resched())
 			cond_resched();
 		rth = rt_hash_table[i].chain;
@@ -726,41 +727,27 @@  static void rt_do_flush(int process_context)
 			continue;
 
 		spin_lock_bh(rt_hash_lock_addr(i));
-#ifdef CONFIG_NET_NS
-		{
-		struct rtable ** prev, * p;
 
-		rth = rt_hash_table[i].chain;
+		pprev = &rt_hash_table[i].chain;
+		rth = *pprev;
+		while (rth) {
+			next = rth->dst.rt_next;
+			if (dev_net(rth->dst.dev) == net) {
+				*pprev = next;
 
-		/* defer releasing the head of the list after spin_unlock */
-		for (tail = rth; tail; tail = tail->dst.rt_next)
-			if (!rt_is_expired(tail))
-				break;
-		if (rth != tail)
-			rt_hash_table[i].chain = tail;
-
-		/* call rt_free on entries after the tail requiring flush */
-		prev = &rt_hash_table[i].chain;
-		for (p = *prev; p; p = next) {
-			next = p->dst.rt_next;
-			if (!rt_is_expired(p)) {
-				prev = &p->dst.rt_next;
-			} else {
-				*prev = next;
-				rt_free(p);
-			}
-		}
+				rth->dst.rt_next = list;
+				list = rth;
+			} else
+				pprev = &rth->dst.rt_next;
+
+			rth = next;
 		}
-#else
-		rth = rt_hash_table[i].chain;
-		rt_hash_table[i].chain = NULL;
-		tail = NULL;
-#endif
+
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
-		for (; rth != tail; rth = next) {
-			next = rth->dst.rt_next;
-			rt_free(rth);
+		for (; list; list = next) {
+			next = list->dst.rt_next;
+			rt_free(list);
 		}
 	}
 }
@@ -906,13 +893,13 @@  void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
 	if (delay >= 0)
-		rt_do_flush(!in_softirq());
+		rt_do_flush(net, !in_softirq());
 }
 
 /* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(void)
+void rt_cache_flush_batch(struct net *net)
 {
-	rt_do_flush(!in_softirq());
+	rt_do_flush(net, !in_softirq());
 }
 
 static void rt_emergency_hash_rebuild(struct net *net)