
[PATCHv4,net-next] xfrm: Namespacify xfrm_policy_sk_bundles

Message ID 52B3BAD1.30205@windriver.com
State Awaiting Upstream, archived
Delegated to: David Miller
Headers show

Commit Message

fan.du Dec. 20, 2013, 3:34 a.m. UTC
On Dec. 19, 2013 11:44, Eric Dumazet wrote:
> On Thu, 2013-12-19 at 11:17 +0800, Fan Du wrote:
>>
>
>> diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
>> index 1006a26..22f4f90 100644
>> --- a/include/net/netns/xfrm.h
>> +++ b/include/net/netns/xfrm.h
>> @@ -58,9 +58,9 @@ struct netns_xfrm {
>>    	struct dst_ops		xfrm6_dst_ops;
>>    #endif
>>    	spinlock_t xfrm_state_lock;
>> -	spinlock_t xfrm_policy_sk_bundle_lock;
>>    	rwlock_t xfrm_policy_lock;
>>    	struct mutex xfrm_cfg_mutex;
>> +	struct llist_head xp_sk_bundles_list;
>>    };
>>
>>    #endif
>> diff --git a/include/net/xfrm.h b/include/net/xfrm.h
>> index 59f5d0a..05296ab 100644
>> --- a/include/net/xfrm.h
>> +++ b/include/net/xfrm.h
>> @@ -957,6 +957,7 @@ struct xfrm_dst {
>>    	u32 child_mtu_cached;
>>    	u32 route_cookie;
>>    	u32 path_cookie;
>> +	struct llist_node       xdst_llist;
>>    };
>>
>
> Hmm... That's not very nice.
>
> Please reuse the storage, adding a llist_node in the union?
>
> diff --git a/include/net/dst.h b/include/net/dst.h
> index 44995c13e941..3f604f47cc58 100644
> --- a/include/net/dst.h
> +++ b/include/net/dst.h
> @@ -14,6 +14,7 @@
>   #include <linux/rcupdate.h>
>   #include <linux/bug.h>
>   #include <linux/jiffies.h>
> +#include <linux/llist.h>
>   #include <net/neighbour.h>
>   #include <asm/processor.h>
>
> @@ -103,6 +104,7 @@ struct dst_entry {
>   		struct rtable __rcu	*rt_next;
>   		struct rt6_info		*rt6_next;
>   		struct dn_route __rcu	*dn_next;
> +		struct llist_node	llist;
>   	};
>   };

Hi, Eric

You are correct on this; I have fixed the patch per your comments.
Thank you very much.


 From 1b929e1cdea978d0c8d10d731af84969bd37004b Mon Sep 17 00:00:00 2001
From: Fan Du <fan.du@windriver.com>
Date: Fri, 20 Dec 2013 11:23:27 +0800
Subject: [PATCHv4 net-next] xfrm: Namespacify xfrm_policy_sk_bundles

xfrm_policy_sk_bundles, protected by net->xfrm.xfrm_policy_sk_bundle_lock,
should be put into the netns xfrm structure; otherwise xfrm_policy_sk_bundles
can be corrupted from a different net namespace.

Moreover, the current xfrm_policy_sk_bundle_lock is used in the two scenarios below:

1. xfrm_lookup (process context)  vs  __xfrm_garbage_collect (softirq context)
2. xfrm_lookup (process context)  vs  __xfrm_garbage_collect (process context,
                                                 when the SPD changes or a device goes down)

We can use a lockless list to cover both scenarios, as suggested by
Eric Dumazet.

Signed-off-by: Fan Du <fan.du@windriver.com>
Assisted-by: Eric Dumazet <edumazet@google.com>
---
v2:
   Fix incorrect commit log.

v3:
   Drop xchg, use llist instead, advised by Eric Dumazet.

v4:
  Incorporate Eric's comments:
   - Union llist with dst_entry->next, which was previously used for the
     queuing, so it is safe to do so.
   - Use llist_for_each_entry_safe for purging the list.
   - Rebase onto net-next, as ipsec-next got pulled.

---
  include/net/dst.h        |    5 +++++
  include/net/netns/xfrm.h |    2 +-
  net/xfrm/xfrm_policy.c   |   26 ++++++--------------------
  3 files changed, 12 insertions(+), 21 deletions(-)
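
For readers unfamiliar with <linux/llist.h>, here is a minimal sketch of the
pattern the patch adopts (the names below are illustrative, not taken from
the patch): producers may call llist_add() concurrently from both process
and softirq context without a lock or spin_lock_bh(), because llist_add()
is a cmpxchg loop, while the consumer detaches the whole list atomically
with llist_del_all().

#include <linux/llist.h>
#include <linux/slab.h>

struct bundle {
	struct llist_node node;
	/* ... payload ... */
};

static LLIST_HEAD(sk_bundles);

/* Producer: safe from process and softirq context alike; no lock needed. */
static void bundle_queue(struct bundle *b)
{
	llist_add(&b->node, &sk_bundles);
}

/* Consumer: detach everything in one atomic step, then free. The _safe
 * iterator is required because kfree() invalidates the node being visited. */
static void bundle_purge(void)
{
	struct llist_node *head = llist_del_all(&sk_bundles);
	struct bundle *b, *tmp;

	llist_for_each_entry_safe(b, tmp, head, node)
		kfree(b);
}

This mirrors what the patch does with net->xfrm.xp_sk_bundles_list in
xfrm_lookup() and __xfrm_garbage_collect(), with dst_free() in place of
kfree().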

Comments

fan.du Dec. 24, 2013, 1:12 a.m. UTC | #1
On Dec. 20, 2013 11:34, Fan Du wrote:
>
> On Dec. 19, 2013 11:44, Eric Dumazet wrote:
>> Hmm... That's not very nice.
>>
>> Please reuse the storage, adding a llist_node in the union?
>
> You are correct on this; I have fixed the patch per your comments.
>
> Subject: [PATCHv4 net-next] xfrm: Namespacify xfrm_policy_sk_bundles
>
> xfrm_policy_sk_bundles, protected by net->xfrm.xfrm_policy_sk_bundle_lock,
> should be put into the netns xfrm structure; otherwise xfrm_policy_sk_bundles
> can be corrupted from a different net namespace.

Hi Dave

I saw this patch is marked 'Not Applicable' in patchwork.

I'm not sure what is improper about it; corruption of xfrm_policy_sk_bundles
from a different netns is a real problem here. If you don't like the lockless-list
addition that replaces the original spinlock, I could drop that part if you
require, and only move xfrm_policy_sk_bundles into net.xfrm.

Could you please comment on this?
Thanks a lot :)

David Miller Dec. 24, 2013, 5:31 a.m. UTC | #2
From: Fan Du <fan.du@windriver.com>
Date: Tue, 24 Dec 2013 09:12:32 +0800

> I saw this patch is marked 'Not Applicable' in patchwork.

It means that it doesn't go in via my tree directly, it
"doesn't apply".

Instead, it goes into Steffen's IPSEC tree.
fan.du Dec. 24, 2013, 5:39 a.m. UTC | #3
On Dec. 24, 2013 13:31, David Miller wrote:
> From: Fan Du <fan.du@windriver.com>
> Date: Tue, 24 Dec 2013 09:12:32 +0800
>
>> I saw this patch is marked 'Not Applicable' in patchwork.
>
> It means that it doesn't go in via my tree directly, it
> "doesn't apply".
>
> Instead, it goes into Steffen's IPSEC tree.

Thanks for your explanation!

I don't fully understand the patchwork conventions, so one last question:
[PATCHv4 net-next 0/8] pktgen IPsec support is tagged as 'Awaiting Upstream'.
What does that mean, and do I need to take further action on this patch set?
Steffen Klassert Dec. 24, 2013, 9:50 a.m. UTC | #4
On Tue, Dec 24, 2013 at 01:39:45PM +0800, Fan Du wrote:
> On Dec. 24, 2013 13:31, David Miller wrote:
>> From: Fan Du <fan.du@windriver.com>
>> Date: Tue, 24 Dec 2013 09:12:32 +0800
>>
>>> I saw this patch is marked 'Not Applicable' in patchwork.
>>
>> It means that it doesn't go in via my tree directly, it
>> "doesn't apply".
>>
>> Instead, it goes into Steffen's IPSEC tree.
>>

Yes, I'll take care of this one. I'm currently traveling,
so I don't have much time to review and test patches.
I'll integrate all pending ipsec patches after the holidays.

>
> Thanks for your explanation!
>
> I don't fully understand the patchwork conventions, so one last question:
> [PATCHv4 net-next 0/8] pktgen IPsec support is tagged as 'Awaiting Upstream'.
> What does that mean, and do I need to take further action on this patch set?
>

I guess that means the same; it goes into the ipsec-next tree :)
fan.du Dec. 24, 2013, 9:56 a.m. UTC | #5
On Dec. 24, 2013 17:50, Steffen Klassert wrote:
> Yes, I'll take care of this one. I'm currently traveling,
> so I don't have much time to review and test patches.
> I'll integrate all pending ipsec patches after the holidays.
>
>> [PATCHv4 net-next 0/8] pktgen IPsec support is tagged as 'Awaiting Upstream'.
>> What does that mean, and do I need to take further action on this patch set?
>
> I guess that means the same; it goes into the ipsec-next tree :)

OK, understood. And Merry Christmas!
Steffen Klassert Dec. 24, 2013, 10:35 a.m. UTC | #6
On Fri, Dec 20, 2013 at 11:34:41AM +0800, Fan Du wrote:
> 
> Subject: [PATCHv4 net-next] xfrm: Namespacify xfrm_policy_sk_bundles
> 
> xfrm_policy_sk_bundles, protected by net->xfrm.xfrm_policy_sk_bundle_lock,
> should be put into the netns xfrm structure; otherwise xfrm_policy_sk_bundles
> can be corrupted from a different net namespace.

I'm ok with this patch, but I wonder where we use these cached socket
bundles. After a quick look I see where we add and where we delete
them, but I can't see how we use these cached bundles. 
David Miller Dec. 24, 2013, 5:54 p.m. UTC | #7
From: Fan Du <fan.du@windriver.com>
Date: Tue, 24 Dec 2013 13:39:45 +0800

> On Dec. 24, 2013 13:31, David Miller wrote:
>> From: Fan Du <fan.du@windriver.com>
>> Date: Tue, 24 Dec 2013 09:12:32 +0800
>>
>>> I saw this patch is marked 'Not Applicable' in patchwork.
>>
>> It means that it doesn't go in via my tree directly, it
>> "doesn't apply".
>>
>> Instead, it goes into Steffen's IPSEC tree.
>
> Thanks for your explanation!
>
> I don't fully understand the patchwork conventions, so one last question:
> [PATCHv4 net-next 0/8] pktgen IPsec support is tagged as 'Awaiting Upstream'.
> What does that mean, and do I need to take further action on this patch set?

It means the same thing, that I expect to receive this change via
upstream, which is Steffen's IPSEC tree.
fan.du Dec. 25, 2013, 6:40 a.m. UTC | #8
Cc'ing Timo

On Dec. 24, 2013 18:35, Steffen Klassert wrote:
> On Fri, Dec 20, 2013 at 11:34:41AM +0800, Fan Du wrote:
>>
>> Subject: [PATCHv4 net-next] xfrm: Namespacify xfrm_policy_sk_bundles
>>
>> xfrm_policy_sk_bundles, protected by net->xfrm.xfrm_policy_sk_bundle_lock,
>> should be put into the netns xfrm structure; otherwise xfrm_policy_sk_bundles
>> can be corrupted from a different net namespace.
>
> I'm ok with this patch, but I wonder where we use these cached socket
> bundles. After a quick look I see where we add and where we delete
> them, but I can't see how we use these cached bundles.

Interesting.

The per-socket bundles were introduced by Timo in commit 80c802f3
("xfrm: cache bundles instead of policies for outgoing flows").

But one fundamental question is: why not use the existing flow cache
for per-socket bundles as well? Then there would be no need to create
such a per-socket xdst for every packet, and they could share the same
flow cache flush mechanism.

My first impression is that it can be done this way; I'm going to head
in this direction unless it turns out otherwise.

So, Timo?
Timo Teras Dec. 25, 2013, 8:11 a.m. UTC | #9
On Wed, 25 Dec 2013 14:40:36 +0800
Fan Du <fan.du@windriver.com> wrote:

> Cc'ing Timo
>
> On Dec. 24, 2013 18:35, Steffen Klassert wrote:
>> I'm ok with this patch, but I wonder where we use these cached
>> socket bundles. After a quick look I see where we add and where we
>> delete them, but I can't see how we use these cached bundles.
>
> Interesting.
>
> The per-socket bundles were introduced by Timo in commit 80c802f3
> ("xfrm: cache bundles instead of policies for outgoing flows")

Those existed even before; I just did a systematic transformation of the
caching code to work on the bundle level instead of the policy level.

> But one fundamental question is: why not use the existing flow cache
> for per-socket bundles as well? Then there would be no need to create
> such a per-socket xdst for every packet, and they could share the same
> flow cache flush mechanism.

It was needed when the flow cache cached policies. The lookups explicitly
needed to check the socket for a per-socket policy. So it made no sense
to have anything socket-related in the cache.

> My first impression is that it can be done this way; I'm going to head
> in this direction unless it turns out otherwise.

I think it could be converted. You'll still need to look up the per-socket
policies. But if caching the bundles in the flow cache simplifies the
overall code, it sounds like a good thing to do.

- Timo
fan.du Dec. 25, 2013, 8:44 a.m. UTC | #10
On Dec. 25, 2013 16:11, Timo Teras wrote:
> On Wed, 25 Dec 2013 14:40:36 +0800
> Fan Du <fan.du@windriver.com> wrote:
>
>> The per-socket bundles were introduced by Timo in commit 80c802f3
>> ("xfrm: cache bundles instead of policies for outgoing flows")
>
> Those existed even before; I just did a systematic transformation of the
> caching code to work on the bundle level instead of the policy level.

Apologies, and thanks for your quick reply :)

>> But one fundamental question is: why not use the existing flow cache
>> for per-socket bundles as well? Then there would be no need to create
>> such a per-socket xdst for every packet, and they could share the same
>> flow cache flush mechanism.
>
> It was needed when the flow cache cached policies. The lookups explicitly
> needed to check the socket for a per-socket policy. So it made no sense
> to have anything socket-related in the cache.

I understand your concern.

Per-socket bundles could be distinguished by putting a per-socket policy
pointer into struct flow_cache_entry, and then comparing the policy cached
in the entry against the socket's own policy, as sketched below.
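
A rough, hypothetical sketch of that idea (not actual kernel code; the struct
layout is abbreviated, and the sk_policy field and the helper are invented
here for illustration):

#include <net/flow.h>
#include <net/sock.h>
#include <net/xfrm.h>

/* Hypothetical variant of net/core/flow.c's flow_cache_entry with one
 * extra field recording which socket policy produced the cached object. */
struct flow_cache_entry_sk {
	struct hlist_node		hlist;
	struct net			*net;
	u16				family;
	u8				dir;
	struct flowi			key;
	struct flow_cache_object	*object;
	struct xfrm_policy		*sk_policy;	/* NULL for non-socket flows */
};

/* A cached entry would only be reused when it was created under the
 * same per-socket policy (or under no socket policy at all). */
static bool fle_matches_sk(const struct flow_cache_entry_sk *fle,
			   const struct sock *sk)
{
	return fle->sk_policy ==
	       (sk ? sk->sk_policy[XFRM_POLICY_OUT] : NULL);
}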

I also notice that the flow cache is global across different namespaces,
while a flow cache flush is a per-cpu (and therefore also global) operation.
That's not fair to a slim netns compared with a fat netns that floods the
flow cache. Maybe it's time to make the flow cache namespace-aware as well.

Let's see what Steffen thinks about it.
Steffen Klassert Jan. 6, 2014, 10:35 a.m. UTC | #11
On Wed, Dec 25, 2013 at 04:44:26PM +0800, Fan Du wrote:
>
> On Dec. 25, 2013 16:11, Timo Teras wrote:
>> It was needed when the flow cache cached policies. The lookups explicitly
>> needed to check the socket for a per-socket policy. So it made no sense
>> to have anything socket-related in the cache.
>
> I understand your concern.
>
> Per-socket bundles could be distinguished by putting a per-socket policy
> pointer into struct flow_cache_entry, and then comparing the policy cached
> in the entry against the socket's own policy.

Most protocols cache the used routes at the sockets, so I'm not sure if
we really need to cache them in xfrm too.

Given the fact that we don't use these cached socket policy bundles,
it would already be an improvement to simply remove this caching.
All we are doing here is wasting memory.

>
> I also notice that the flow cache is global across different namespaces,
> while a flow cache flush is a per-cpu (and therefore also global) operation.
> That's not fair to a slim netns compared with a fat netns that floods the
> flow cache. Maybe it's time to make the flow cache namespace-aware as well.

Yes, making the flow cache namespace-aware would be a good thing.
fan.du Jan. 7, 2014, 2:43 a.m. UTC | #12
On Jan. 6, 2014 18:35, Steffen Klassert wrote:
> On Wed, Dec 25, 2013 at 04:44:26PM +0800, Fan Du wrote:
>>
>> Per-socket bundles could be distinguished by putting a per-socket policy
>> pointer into struct flow_cache_entry, and then comparing the policy cached
>> in the entry against the socket's own policy.

Yes, I tested a socket policy with UDP: on transmit, the dst is cached in the
socket by sk_dst_set (a simplified sketch of that pattern follows below).
Let's leave the current implementation as it is.

Please kindly review v4 if there is any remaining concern.
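
A simplified sketch of that per-socket dst caching pattern (roughly what
ip_queue_xmit() and friends do; lookup_route_somehow() is a placeholder for
a real routing lookup such as ip_route_output_flow(), and error handling is
elided):

#include <linux/err.h>
#include <net/dst.h>
#include <net/sock.h>

/* Placeholder for a real route lookup; purely illustrative. */
struct dst_entry *lookup_route_somehow(struct sock *sk, struct flowi *fl);

static struct dst_entry *get_tx_dst(struct sock *sk, struct flowi *fl)
{
	/* Return the cached dst if it is still valid; __sk_dst_check()
	 * drops a stale entry and returns NULL. */
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (!dst) {
		dst = lookup_route_somehow(sk, fl);
		if (!IS_ERR_OR_NULL(dst))
			sk_dst_set(sk, dst);	/* cache for the next packet */
	}
	return dst;
}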

> Most protocols cache the used routes at the sockets, so I'm not sure if
> we really need to cache them in xfrm too.
>
> Given the fact that we don't use these cached socket policy bundles,
> it would already be an improvement to simply remove this caching.
> All we are doing here is wasting memory.
>>
>> I also notice that the flow cache is global across different namespaces,
>> while a flow cache flush is a per-cpu (and therefore also global) operation.
>> That's not fair to a slim netns compared with a fat netns that floods the
>> flow cache. Maybe it's time to make the flow cache namespace-aware as well.
>
> Yes, making the flow cache namespace-aware would be a good thing.
>

I will give it a try :)
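
A rough sketch of that direction, entirely hypothetical here (struct
flow_cache is currently private to net/core/flow.c, so its definition would
first have to move to a header, and the field name below is invented):

/* Hypothetical: give each namespace its own flow cache, so a flush only
 * walks that namespace's per-cpu tables instead of every namespace's. */
struct netns_xfrm {
	/* ... existing fields (xfrm_state_lock, xfrm_policy_lock, ...) ... */
	struct flow_cache	flow_cache_global;	/* per-netns cache + flush state */
};

flow_cache_lookup() already receives a struct net *, so it could resolve
&net->xfrm.flow_cache_global from there instead of using the file-scope
global, and the flush path would take a net argument as well.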

Patch

diff --git a/include/net/dst.h b/include/net/dst.h
index 77eb53f..e9b81f0 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -14,6 +14,7 @@ 
  #include <linux/rcupdate.h>
  #include <linux/bug.h>
  #include <linux/jiffies.h>
+#include <linux/llist.h>
  #include <net/neighbour.h>
  #include <asm/processor.h>

@@ -103,6 +104,10 @@  struct dst_entry {
  		struct rtable __rcu	*rt_next;
  		struct rt6_info		*rt6_next;
  		struct dn_route __rcu	*dn_next;
+#ifdef CONFIG_XFRM
+		/* Used to queue each policy sk dst */
+		struct llist_node       llist;
+#endif
  	};
  };

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 1006a26..22f4f90 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -58,9 +58,9 @@  struct netns_xfrm {
  	struct dst_ops		xfrm6_dst_ops;
  #endif
  	spinlock_t xfrm_state_lock;
-	spinlock_t xfrm_policy_sk_bundle_lock;
  	rwlock_t xfrm_policy_lock;
  	struct mutex xfrm_cfg_mutex;
+	struct llist_head xp_sk_bundles_list;
  };

  #endif
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index a7487f3..0cc7446 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -39,8 +39,6 @@ 
  #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ))
  #define XFRM_MAX_QUEUE_LEN	100

-static struct dst_entry *xfrm_policy_sk_bundles;
-
  static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
  static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
  						__read_mostly;
@@ -2108,12 +2106,7 @@  struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
  			}

  			dst_hold(&xdst->u.dst);
-
-			spin_lock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
-			xdst->u.dst.next = xfrm_policy_sk_bundles;
-			xfrm_policy_sk_bundles = &xdst->u.dst;
-			spin_unlock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
-
+			llist_add(&xdst->u.dst.llist, &net->xfrm.xp_sk_bundles_list);
  			route = xdst->route;
  		}
  	}
@@ -2549,18 +2542,12 @@  static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst)

  static void __xfrm_garbage_collect(struct net *net)
  {
-	struct dst_entry *head, *next;
+	struct llist_node *head;
+	struct dst_entry *dst, *tmp;

-	spin_lock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
-	head = xfrm_policy_sk_bundles;
-	xfrm_policy_sk_bundles = NULL;
-	spin_unlock_bh(&net->xfrm.xfrm_policy_sk_bundle_lock);
-
-	while (head) {
-		next = head->next;
-		dst_free(head);
-		head = next;
-	}
+	head = llist_del_all(&net->xfrm.xp_sk_bundles_list);
+	llist_for_each_entry_safe(dst, tmp, head, llist)
+		dst_free(dst);
  }

  void xfrm_garbage_collect(struct net *net)
@@ -2942,7 +2929,6 @@  static int __net_init xfrm_net_init(struct net *net)
  	/* Initialize the per-net locks here */
  	spin_lock_init(&net->xfrm.xfrm_state_lock);
  	rwlock_init(&net->xfrm.xfrm_policy_lock);
-	spin_lock_init(&net->xfrm.xfrm_policy_sk_bundle_lock);
  	mutex_init(&net->xfrm.xfrm_cfg_mutex);

  	return 0;