Message ID | 1434689355-4088-2-git-send-email-roopa@cumulusnetworks.com |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
On 19/06/15 05:49, Roopa Prabhu wrote: > From: Roopa Prabhu <roopa@cumulusnetworks.com> > > provides ops to parse, build and output encaped > packets for drivers that want to attach tunnel encap > information to routes. > > Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> > --- > include/linux/lwtunnel.h | 6 ++ > include/net/lwtunnel.h | 84 +++++++++++++++++++++ > include/uapi/linux/lwtunnel.h | 11 +++ > net/Kconfig | 5 ++ > net/core/Makefile | 1 + > net/core/lwtunnel.c | 162 +++++++++++++++++++++++++++++++++++++++++ > 6 files changed, 269 insertions(+) > create mode 100644 include/linux/lwtunnel.h > create mode 100644 include/net/lwtunnel.h > create mode 100644 include/uapi/linux/lwtunnel.h > create mode 100644 net/core/lwtunnel.c > > diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h > new file mode 100644 > index 0000000..97f32f8 > --- /dev/null > +++ b/include/linux/lwtunnel.h > @@ -0,0 +1,6 @@ > +#ifndef _LINUX_LWTUNNEL_H_ > +#define _LINUX_LWTUNNEL_H_ > + > +#include <uapi/linux/lwtunnel.h> > + > +#endif /* _LINUX_LWTUNNEL_H_ */ > diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h > new file mode 100644 > index 0000000..649da3c > --- /dev/null > +++ b/include/net/lwtunnel.h > @@ -0,0 +1,84 @@ > +#ifndef __NET_LWTUNNEL_H > +#define __NET_LWTUNNEL_H 1 > + > +#include <linux/lwtunnel.h> > +#include <linux/netdevice.h> > +#include <linux/skbuff.h> > +#include <linux/types.h> > +#include <net/dsfield.h> > +#include <net/ip.h> > +#include <net/rtnetlink.h> > + > +#define LWTUNNEL_HASH_BITS 7 > +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) > + > +struct lwtunnel_hdr { > + int len; > + __u8 data[0]; > +}; > + > +/* lw tunnel state flags */ > +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 > + > +#define lwtunnel_output_redirect(lwtstate) (lwtstate && \ > + (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) This could be made an inline function for type-safety. > + > +struct lwtunnel_state { > + __u16 type; > + __u16 flags; > + atomic_t refcnt; > + struct lwtunnel_hdr tunnel; > +}; > + > +struct lwtunnel_net { > + struct hlist_head tunnels[LWTUNNEL_HASH_SIZE]; > +}; This type doesn't appear to be used in this patch series. Do you intend to use it in a future version? > + > +struct lwtunnel_encap_ops { > + int (*build_state)(struct net_device *dev, struct nlattr *encap, > + struct lwtunnel_state **ts); > + int (*output)(struct sock *sk, struct sk_buff *skb); > + int (*fill_encap)(struct sk_buff *skb, > + struct lwtunnel_state *lwtstate); > + int (*get_encap_size)(struct lwtunnel_state *lwtstate); > +}; > + > +#define MAX_LWTUNNEL_ENCAP_OPS 8 > +extern const struct lwtunnel_encap_ops __rcu * > + lwtun_encaps[MAX_LWTUNNEL_ENCAP_OPS]; > + > +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) > +{ > + atomic_inc(&lws->refcnt); > +} > + > +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) > +{ > + if (!lws) > + return; > + > + if (atomic_dec_and_test(&lws->refcnt)) > + kfree(lws); > +} > + > +static inline struct lwtunnel_state *lwtunnel_skb_lwstate(struct sk_buff *skb) > +{ > + struct rtable *rt = (struct rtable *)skb_dst(skb); > + > + return rt->rt_lwtstate; > +} It doesn't look like this patch will build on its own because rt_lwtstate isn't added to struct rtable until patch 2. More importantly, is it safe to assume that skb_dst will always return an IPv4 dst? How will this look when IPv6 support is added? > + > +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, > + unsigned int num); > +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, > + unsigned int num); > +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, > + struct nlattr *encap, > + struct lwtunnel_state **lws); > +int lwtunnel_fill_encap(struct sk_buff *skb, > + struct lwtunnel_state *lwtstate); > +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); > +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); > +int lwtunnel_output(struct sock *sk, struct sk_buff *skb); > + > +#endif /* __NET_LWTUNNEL_H */ ... > diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c > new file mode 100644 > index 0000000..29c7802 > --- /dev/null > +++ b/net/core/lwtunnel.c > @@ -0,0 +1,162 @@ > +/* > + * lwtunnel Infrastructure for light weight tunnels like mpls > + * > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + * > + */ > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > + > +#include <linux/capability.h> > +#include <linux/module.h> > +#include <linux/types.h> > +#include <linux/kernel.h> > +#include <linux/slab.h> > +#include <linux/uaccess.h> > +#include <linux/skbuff.h> > +#include <linux/netdevice.h> > +#include <linux/in.h> > +#include <linux/init.h> > +#include <linux/err.h> > + > +#include <net/lwtunnel.h> > +#include <net/rtnetlink.h> > + > +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) > +{ > + struct lwtunnel_state *lws; > + > + return kzalloc(sizeof(*lws) + hdr_len, GFP_KERNEL); > +} > +EXPORT_SYMBOL(lwtunnel_state_alloc); > + > +const struct lwtunnel_encap_ops __rcu * > + lwtun_encaps[MAX_LWTUNNEL_ENCAP_OPS] __read_mostly; > + > +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, > + unsigned int num) > +{ > + if (num >= MAX_LWTUNNEL_ENCAP_OPS) > + return -ERANGE; > + > + return !cmpxchg((const struct lwtunnel_encap_ops **) > + &lwtun_encaps[num], > + NULL, ops) ? 0 : -1; > +} > +EXPORT_SYMBOL(lwtunnel_encap_add_ops); > + > +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, > + unsigned int num) > +{ > + int ret; > + > + if (num >= MAX_LWTUNNEL_ENCAP_OPS) > + return -ERANGE; > + > + ret = (cmpxchg((const struct lwtunnel_encap_ops **) > + &lwtun_encaps[num], > + ops, NULL) == ops) ? 0 : -1; > + > + synchronize_net(); > + > + return ret; > +} > +EXPORT_SYMBOL(lwtunnel_encap_del_ops); > + > +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, > + struct nlattr *encap, struct lwtunnel_state **lws) > +{ > + const struct lwtunnel_encap_ops *ops; > + int ret = -EINVAL; > + > + if (encap_type == LWTUNNEL_ENCAP_NONE || > + encap_type >= MAX_LWTUNNEL_ENCAP_OPS) > + return ret; > + > + ret = -EOPNOTSUPP; > + rcu_read_lock(); > + ops = rcu_dereference(lwtun_encaps[encap_type]); > + if (likely(ops && ops->build_state)) > + ret = ops->build_state(dev, encap, lws); > + rcu_read_unlock(); > + > + return ret; > +} > +EXPORT_SYMBOL(lwtunnel_build_state); > + > +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) > +{ > + const struct lwtunnel_encap_ops *ops; > + struct nlattr *nest; > + int ret = -EINVAL; > + > + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || > + lwtstate->type >= MAX_LWTUNNEL_ENCAP_OPS) > + return 0; > + > + ret = -EOPNOTSUPP; > + nest = nla_nest_start(skb, RTA_ENCAP); Again, it doesn't look like this will build since RTA_ENCAP isn't added until patch 2. Thanks, Rob -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 6/19/15, 7:43 AM, Robert Shearman wrote: >> diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h >> new file mode 100644 <snip> >> +/* lw tunnel state flags */ >> +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 >> + >> +#define lwtunnel_output_redirect(lwtstate) (lwtstate && \ >> + (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) > > This could be made an inline function for type-safety. ack > >> + >> +struct lwtunnel_state { >> + __u16 type; >> + __u16 flags; >> + atomic_t refcnt; >> + struct lwtunnel_hdr tunnel; >> +}; >> + >> +struct lwtunnel_net { >> + struct hlist_head tunnels[LWTUNNEL_HASH_SIZE]; >> +}; > > This type doesn't appear to be used in this patch series. Do you > intend to use it in a future version? ack, will get rid of it > >> >> + >> +static inline struct lwtunnel_state *lwtunnel_skb_lwstate(struct >> sk_buff *skb) >> +{ >> + struct rtable *rt = (struct rtable *)skb_dst(skb); >> + >> + return rt->rt_lwtstate; >> +} > > It doesn't look like this patch will build on its own because > rt_lwtstate isn't added to struct rtable until patch 2. looks like i messed up the patch creation. I will fix that with the next series. > > More importantly, is it safe to assume that skb_dst will always return > an IPv4 dst? How will this look when IPv6 support is added? Today lwtunnel_skb_lwstate is called from lwtunnel_output which is only called from ipv4 code. And my ipv6 variant code was supposed to have a 6 suffix. something like lwtunnel_output6. Or to be more explicit i will probably have variants of the output and skb handling functions like, lwtunnel_output_ipv4 and lwtunnel_output_ipv6. >> + >> + ret = -EOPNOTSUPP; >> + nest = nla_nest_start(skb, RTA_ENCAP); > > Again, it doesn't look like this will build since RTA_ENCAP isn't > added until patch 2. > ack, sorry abt the patch ordering. will fix it. Thanks for the review. -- To unsubscribe from this list: send the line "unsubscribe netdev" in
n 19/06/15 16:14, roopa wrote: > On 6/19/15, 7:43 AM, Robert Shearman wrote: >>> >>> + >>> +static inline struct lwtunnel_state *lwtunnel_skb_lwstate(struct >>> sk_buff *skb) >>> +{ >>> + struct rtable *rt = (struct rtable *)skb_dst(skb); >>> + >>> + return rt->rt_lwtstate; >>> +} >> >> It doesn't look like this patch will build on its own because >> rt_lwtstate isn't added to struct rtable until patch 2. > looks like i messed up the patch creation. I will fix that with the next > series. >> >> More importantly, is it safe to assume that skb_dst will always return >> an IPv4 dst? How will this look when IPv6 support is added? > > Today lwtunnel_skb_lwstate is called from lwtunnel_output which is only > called from ipv4 code. > And my ipv6 variant code was supposed to have a 6 suffix. something like > lwtunnel_output6. > Or to be more explicit i will probably have variants of the output and > skb handling functions like, > lwtunnel_output_ipv4 and lwtunnel_output_ipv6. Do you intend for these functions to be used by netdevices to support the vxlan use case? If so, then how will the netdevice know which one of the two to call? Will there have to be a netdevice for ipv4 and a netdevice for ipv6? If not, could you outline how you intend for it to be implemented? Thanks, Rob -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 6/19/15, 10:25 AM, Robert Shearman wrote: > n 19/06/15 16:14, roopa wrote: >> Today lwtunnel_skb_lwstate is called from lwtunnel_output which is only >> called from ipv4 code. >> And my ipv6 variant code was supposed to have a 6 suffix. something like >> lwtunnel_output6. >> Or to be more explicit i will probably have variants of the output and >> skb handling functions like, >> lwtunnel_output_ipv4 and lwtunnel_output_ipv6. > > Do you intend for these functions to be used by netdevices to support > the vxlan use case? > > If so, then how will the netdevice know which one of the two to call? > Will there have to be a netdevice for ipv4 and a netdevice for ipv6? > > If not, could you outline how you intend for it to be implemented? In the netdevice case, this output function is not called atall. It should just follow the existing netdevice the route is pointing to. -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 19/06/15 19:34, roopa wrote: > On 6/19/15, 10:25 AM, Robert Shearman wrote: >> n 19/06/15 16:14, roopa wrote: >>> Today lwtunnel_skb_lwstate is called from lwtunnel_output which is only >>> called from ipv4 code. >>> And my ipv6 variant code was supposed to have a 6 suffix. something like >>> lwtunnel_output6. >>> Or to be more explicit i will probably have variants of the output and >>> skb handling functions like, >>> lwtunnel_output_ipv4 and lwtunnel_output_ipv6. >> >> Do you intend for these functions to be used by netdevices to support >> the vxlan use case? >> >> If so, then how will the netdevice know which one of the two to call? >> Will there have to be a netdevice for ipv4 and a netdevice for ipv6? >> >> If not, could you outline how you intend for it to be implemented? > > In the netdevice case, this output function is not called atall. It > should just follow the existing netdevice the route is pointing to. Sorry for not being clear, but I meant that there would have to be lwtunnel_skb_lwstate functions for ipv4 and ipv6 to match the output functions. So in the vxlan use case where it's using a netdevice, how would it determine which one to call? Thanks, Rob -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 6/19/15, 11:39 AM, Robert Shearman wrote: > On 19/06/15 19:34, roopa wrote: >> On 6/19/15, 10:25 AM, Robert Shearman wrote: >>> n 19/06/15 16:14, roopa wrote: >>> >> In the netdevice case, this output function is not called atall. It >> should just follow the existing netdevice the route is pointing to. > > Sorry for not being clear, but I meant that there would have to be > lwtunnel_skb_lwstate functions for ipv4 and ipv6 to match the output > functions. So in the vxlan use case where it's using a netdevice, how > would it determine which one to call? thanks for that clarification, and good point. I see some areas of the kernel checking for skb->protocol to do the conversion (something like below). I am guessing that is acceptable. if (skb->protocol == htons(ETH_P_IPV6)) struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); -- To unsubscribe from this list: send the line "unsubscribe netdev" in
<<<snip>>> > diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c > new file mode 100644 > index 0000000..29c7802 > --- /dev/null > +++ b/net/core/lwtunnel.c > @@ -0,0 +1,162 @@ > +/* > + * lwtunnel Infrastructure for light weight tunnels like mpls > + * > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + * > + */ > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > + > +#include <linux/capability.h> > +#include <linux/module.h> > +#include <linux/types.h> > +#include <linux/kernel.h> > +#include <linux/slab.h> > +#include <linux/uaccess.h> > +#include <linux/skbuff.h> > +#include <linux/netdevice.h> > +#include <linux/in.h> > +#include <linux/init.h> > +#include <linux/err.h> > + > +#include <net/lwtunnel.h> > +#include <net/rtnetlink.h> > + > +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) > +{ > + struct lwtunnel_state *lws; > + > + return kzalloc(sizeof(*lws) + hdr_len, GFP_KERNEL); This seems to be called with rcu_read_lock so GFP_ATOMIC would have to be used. (Judging by patch 3/3’s mpls_build_state and lwtunnel_build_state) > +} > +EXPORT_SYMBOL(lwtunnel_state_alloc); > + > +const struct lwtunnel_encap_ops __rcu * > + lwtun_encaps[MAX_LWTUNNEL_ENCAP_OPS] __read_mostly; > + > +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, > + unsigned int num) > +{ > + if (num >= MAX_LWTUNNEL_ENCAP_OPS) > + return -ERANGE; > + > + return !cmpxchg((const struct lwtunnel_encap_ops **) > + &lwtun_encaps[num], > + NULL, ops) ? 0 : -1; > +} > +EXPORT_SYMBOL(lwtunnel_encap_add_ops); > + > +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, > + unsigned int num) > +{ > + int ret; > + > + if (num >= MAX_LWTUNNEL_ENCAP_OPS) > + return -ERANGE; > + > + ret = (cmpxchg((const struct lwtunnel_encap_ops **) > + &lwtun_encaps[num], > + ops, NULL) == ops) ? 0 : -1; > + > + synchronize_net(); > + > + return ret; > +} > +EXPORT_SYMBOL(lwtunnel_encap_del_ops); > + > +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, > + struct nlattr *encap, struct lwtunnel_state **lws) > +{ > + const struct lwtunnel_encap_ops *ops; > + int ret = -EINVAL; > + > + if (encap_type == LWTUNNEL_ENCAP_NONE || > + encap_type >= MAX_LWTUNNEL_ENCAP_OPS) > + return ret; > + > + ret = -EOPNOTSUPP; > + rcu_read_lock(); > + ops = rcu_dereference(lwtun_encaps[encap_type]); > + if (likely(ops && ops->build_state)) > + ret = ops->build_state(dev, encap, lws); > + rcu_read_unlock(); > + > + return ret; > +} > +EXPORT_SYMBOL(lwtunnel_build_state); > + > +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) > +{ > + const struct lwtunnel_encap_ops *ops; > + struct nlattr *nest; > + int ret = -EINVAL; > + > + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || > + lwtstate->type >= MAX_LWTUNNEL_ENCAP_OPS) > + return 0; > + > + ret = -EOPNOTSUPP; > + nest = nla_nest_start(skb, RTA_ENCAP); > + rcu_read_lock(); > + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); > + if (likely(ops && ops->fill_encap)) > + ret = ops->fill_encap(skb, lwtstate); > + rcu_read_unlock(); > + > + if (ret) > + goto errout; > + > + nla_nest_end(skb, nest); > + > + return 0; > + > +errout: > + nla_nest_cancel(skb, nest); > + > + return ret; > +} > +EXPORT_SYMBOL(lwtunnel_fill_encap); > + > +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) > +{ > + const struct lwtunnel_encap_ops *ops; > + int ret = 0; > + > + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || > + lwtstate->type >= MAX_LWTUNNEL_ENCAP_OPS) > + return 0; > + > + rcu_read_lock(); > + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); > + if (likely(ops && ops->get_encap_size)) > + ret = nla_total_size(ops->get_encap_size(lwtstate)); > + rcu_read_unlock(); > + > + return ret; > +} > +EXPORT_SYMBOL(lwtunnel_get_encap_size); > + > +int lwtunnel_output(struct sock *sk, struct sk_buff *skb) > +{ > + const struct lwtunnel_encap_ops *ops; > + struct lwtunnel_state *lwtstate = lwtunnel_skb_lwstate(skb); > + int ret = 0; > + > + if (!lwtstate) > + return -EINVAL; > + > + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || > + lwtstate->type >= MAX_LWTUNNEL_ENCAP_OPS) > + return 0; > + > + rcu_read_lock(); > + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); > + if (likely(ops && ops->output)) > + ret = ops->output(sk, skb); > + rcu_read_unlock(); > + > + return ret; > +} > +EXPORT_SYMBOL(lwtunnel_output); > -- > 1.7.10.4 > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 06/18/15 at 09:49pm, Roopa Prabhu wrote: > +#include <net/ip.h> > +#include <net/rtnetlink.h> > + > +#define LWTUNNEL_HASH_BITS 7 > +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) > + > +struct lwtunnel_hdr { > + int len; > + __u8 data[0]; > +}; The name header is a bit misleading here. Certain encaps won't preallocate the header. How we just add a len to lwt_state and allow the user have private data? Not sure we need to split this into a separate struct anyway. > +/* lw tunnel state flags */ > +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 > + > +#define lwtunnel_output_redirect(lwtstate) (lwtstate && \ > + (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) Converting this to a static inline function would add type checks by the compiler and it shouldn't result in any different code. > +#define MAX_LWTUNNEL_ENCAP_OPS 8 > +extern const struct lwtunnel_encap_ops __rcu * > + lwtun_encaps[MAX_LWTUNNEL_ENCAP_OPS]; I guess we require everybody to add themselves to the enum so we might as well just derive the MAX from the enum MAX. Unless you want out of tree modules to register themselves. > + > +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) > +{ > + struct lwtunnel_state *lws; > + > + return kzalloc(sizeof(*lws) + hdr_len, GFP_KERNEL); Should this set refcnt to 1? -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 06/20/15 at 07:27am, roopa wrote: > On 6/19/15, 11:39 AM, Robert Shearman wrote: > >On 19/06/15 19:34, roopa wrote: > >>On 6/19/15, 10:25 AM, Robert Shearman wrote: > >>>n 19/06/15 16:14, roopa wrote: > >>> > >>In the netdevice case, this output function is not called atall. It > >>should just follow the existing netdevice the route is pointing to. > > > >Sorry for not being clear, but I meant that there would have to be > >lwtunnel_skb_lwstate functions for ipv4 and ipv6 to match the output > >functions. So in the vxlan use case where it's using a netdevice, how > >would it determine which one to call? > > thanks for that clarification, and good point. I see some areas of the > kernel checking for skb->protocol to do the conversion (something like > below). I am guessing that is acceptable. > if (skb->protocol == htons(ETH_P_IPV6)) > struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); I'm not yet convinced that it makes sense to offer the no-netdevice shortcut for VXLAN. I'm not convinced we need yet another VXLAN data path. In fact, I'm trying to get rid of the OVS one for this specific reason. I have no objection though if somebody comes up with an architecture that can't just pass the required metadata between the namespaces and do the actual encapsulation in a single net_device in the root/host namespace. Either way, I thin it's fair to defer to this to a later point. We don't need to solve this for the first iteration of MPLS and VXLAN implementation. -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 6/20/15, 9:38 AM, Nikolay Aleksandrov wrote: > <<<snip>>> >> diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c >> new file mode 100644 >> index 0000000..29c7802 >> --- /dev/null >> +++ b/net/core/lwtunnel.c >> @@ -0,0 +1,162 @@ >> +/* >> + * lwtunnel Infrastructure for light weight tunnels like mpls >> + * >> + * >> + * This program is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU General Public License >> + * as published by the Free Software Foundation; either version >> + * 2 of the License, or (at your option) any later version. >> + * >> + */ >> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt >> + >> +#include <linux/capability.h> >> +#include <linux/module.h> >> +#include <linux/types.h> >> +#include <linux/kernel.h> >> +#include <linux/slab.h> >> +#include <linux/uaccess.h> >> +#include <linux/skbuff.h> >> +#include <linux/netdevice.h> >> +#include <linux/in.h> >> +#include <linux/init.h> >> +#include <linux/err.h> >> + >> +#include <net/lwtunnel.h> >> +#include <net/rtnetlink.h> >> + >> +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) >> +{ >> + struct lwtunnel_state *lws; >> + >> + return kzalloc(sizeof(*lws) + hdr_len, GFP_KERNEL); > This seems to be called with rcu_read_lock so GFP_ATOMIC would have to > be used. (Judging by patch 3/3’s mpls_build_state and lwtunnel_build_state) > yep, correct. thanks -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 6/21/15, 1:32 PM, Thomas Graf wrote: > On 06/18/15 at 09:49pm, Roopa Prabhu wrote: >> +#include <net/ip.h> >> +#include <net/rtnetlink.h> >> + >> +#define LWTUNNEL_HASH_BITS 7 >> +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) >> + >> +struct lwtunnel_hdr { >> + int len; >> + __u8 data[0]; >> +}; > The name header is a bit misleading here. Certain encaps won't > preallocate the header. How we just add a len to lwt_state and > allow the user have private data? Not sure we need to split this > into a separate struct anyway. sure, I have been debating about that as well. > >> +/* lw tunnel state flags */ >> +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 >> + >> +#define lwtunnel_output_redirect(lwtstate) (lwtstate && \ >> + (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) > Converting this to a static inline function would add type checks > by the compiler and it shouldn't result in any different code. will do, > >> +#define MAX_LWTUNNEL_ENCAP_OPS 8 >> +extern const struct lwtunnel_encap_ops __rcu * >> + lwtun_encaps[MAX_LWTUNNEL_ENCAP_OPS]; > I guess we require everybody to add themselves to the enum so > we might as well just derive the MAX from the enum MAX. Unless you > want out of tree modules to register themselves. I am ok with deriving the MAX from enum Max. > >> + >> +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) >> +{ >> + struct lwtunnel_state *lws; >> + >> + return kzalloc(sizeof(*lws) + hdr_len, GFP_KERNEL); > Should this set refcnt to 1? My alloc does not bump the refcnt but its done right before it is assigned to a nexthop. I was planning on checking the convention followed for this. will check and change if needed. Thanks! -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 6/21/15, 1:40 PM, Thomas Graf wrote: > On 06/20/15 at 07:27am, roopa wrote: >> On 6/19/15, 11:39 AM, Robert Shearman wrote: >>> >>> Sorry for not being clear, but I meant that there would have to be >>> lwtunnel_skb_lwstate functions for ipv4 and ipv6 to match the output >>> functions. So in the vxlan use case where it's using a netdevice, how >>> would it determine which one to call? >> thanks for that clarification, and good point. I see some areas of the >> kernel checking for skb->protocol to do the conversion (something like >> below). I am guessing that is acceptable. >> if (skb->protocol == htons(ETH_P_IPV6)) >> struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); > I'm not yet convinced that it makes sense to offer the no-netdevice > shortcut for VXLAN. I'm not convinced we need yet another VXLAN data > path. In fact, I'm trying to get rid of the OVS one for this specific > reason. > > I have no objection though if somebody comes up with an architecture > that can't just pass the required metadata between the namespaces and > do the actual encapsulation in a single net_device in the root/host > namespace. > > Either way, I thin it's fair to defer to this to a later point. We > don't need to solve this for the first iteration of MPLS and VXLAN > implementation. ack, thanks for your thoughts on this. -- To unsubscribe from this list: send the line "unsubscribe netdev" in
On 06/18/15 at 09:49pm, Roopa Prabhu wrote: > +static inline struct lwtunnel_state *lwtunnel_skb_lwstate(struct sk_buff *skb) > +{ > + struct rtable *rt = (struct rtable *)skb_dst(skb); > + > + return rt->rt_lwtstate; > +} Noticed while rebasing onto your patches. This needs an ifdef CONFIG_LWTUNNEL. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h new file mode 100644 index 0000000..97f32f8 --- /dev/null +++ b/include/linux/lwtunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_LWTUNNEL_H_ +#define _LINUX_LWTUNNEL_H_ + +#include <uapi/linux/lwtunnel.h> + +#endif /* _LINUX_LWTUNNEL_H_ */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h new file mode 100644 index 0000000..649da3c --- /dev/null +++ b/include/net/lwtunnel.h @@ -0,0 +1,84 @@ +#ifndef __NET_LWTUNNEL_H +#define __NET_LWTUNNEL_H 1 + +#include <linux/lwtunnel.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/types.h> +#include <net/dsfield.h> +#include <net/ip.h> +#include <net/rtnetlink.h> + +#define LWTUNNEL_HASH_BITS 7 +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) + +struct lwtunnel_hdr { + int len; + __u8 data[0]; +}; + +/* lw tunnel state flags */ +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 + +#define lwtunnel_output_redirect(lwtstate) (lwtstate && \ + (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) + +struct lwtunnel_state { + __u16 type; + __u16 flags; + atomic_t refcnt; + struct lwtunnel_hdr tunnel; +}; + +struct lwtunnel_net { + struct hlist_head tunnels[LWTUNNEL_HASH_SIZE]; +}; + +struct lwtunnel_encap_ops { + int (*build_state)(struct net_device *dev, struct nlattr *encap, + struct lwtunnel_state **ts); + int (*output)(struct sock *sk, struct sk_buff *skb); + int (*fill_encap)(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); + int (*get_encap_size)(struct lwtunnel_state *lwtstate); +}; + +#define MAX_LWTUNNEL_ENCAP_OPS 8 +extern const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[MAX_LWTUNNEL_ENCAP_OPS]; + +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ + atomic_inc(&lws->refcnt); +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ + if (!lws) + return; + + if (atomic_dec_and_test(&lws->refcnt)) + kfree(lws); +} + +static inline struct lwtunnel_state *lwtunnel_skb_lwstate(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable *)skb_dst(skb); + + return rt->rt_lwtstate; +} + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws); +int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); +int lwtunnel_output(struct sock *sk, struct sk_buff *skb); + +#endif /* __NET_LWTUNNEL_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h new file mode 100644 index 0000000..11150c0 --- /dev/null +++ b/include/uapi/linux/lwtunnel.h @@ -0,0 +1,11 @@ +#ifndef _UAPI_LWTUNNEL_H_ +#define _UAPI_LWTUNNEL_H_ + +#include <linux/types.h> + +enum tunnel_encap_types { + LWTUNNEL_ENCAP_NONE, + LWTUNNEL_ENCAP_MPLS, +}; + +#endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/Kconfig b/net/Kconfig index 57a7c5a..e296d6f 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -374,9 +374,14 @@ source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +config LWTUNNEL + bool "Network light weight tunnels" + ---help--- + light weight tunnels endif # if NET # Used by archs to tell that they support BPF_JIT config HAVE_BPF_JIT bool + diff --git a/net/core/Makefile b/net/core/Makefile index fec0856..086b01f 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 0000000..29c7802 --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,162 @@ +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/in.h> +#include <linux/init.h> +#include <linux/err.h> + +#include <net/lwtunnel.h> +#include <net/rtnetlink.h> + +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) +{ + struct lwtunnel_state *lws; + + return kzalloc(sizeof(*lws) + hdr_len, GFP_KERNEL); +} +EXPORT_SYMBOL(lwtunnel_state_alloc); + +const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[MAX_LWTUNNEL_ENCAP_OPS] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num >= MAX_LWTUNNEL_ENCAP_OPS) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + int ret; + + if (num >= MAX_LWTUNNEL_ENCAP_OPS) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, struct lwtunnel_state **lws) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type >= MAX_LWTUNNEL_ENCAP_OPS) + return ret; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret = -EINVAL; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type >= MAX_LWTUNNEL_ENCAP_OPS) + return 0; + + ret = -EOPNOTSUPP; + nest = nla_nest_start(skb, RTA_ENCAP); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto errout; + + nla_nest_end(skb, nest); + + return 0; + +errout: + nla_nest_cancel(skb, nest); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type >= MAX_LWTUNNEL_ENCAP_OPS) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_get_encap_size); + +int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +{ + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate = lwtunnel_skb_lwstate(skb); + int ret = 0; + + if (!lwtstate) + return -EINVAL; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type >= MAX_LWTUNNEL_ENCAP_OPS) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->output)) + ret = ops->output(sk, skb); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_output);