diff mbox

[RFC,net-next,2/3] ipv4: storing and retrieval of per-nexthop encap

Message ID 1433177175-16775-3-git-send-email-rshearma@brocade.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Robert Shearman June 1, 2015, 4:46 p.m. UTC
Parse RTA_ENCAP attribute for one path and multipath routes. The encap
length is stored in a newly added field to fib_nh, nh_encap_len,
although this is added to a padding hole in the structure so that it
doesn't increase the size at all. The encap data itself is stored at
the end of the array of nexthops. Whilst this means that retrieval
isn't optimal, especially if there are multiple nexthops, this avoids
the memory cost of an extra pointer, as well as any potential change
to the cache or instruction layout that could cause a performance
impact.

Currently, the dst structure allocated to represent the destination of
the packet and used for retrieving the encap by the encap-type
interface has been grown through the addition of the rt_encap_len and
rt_encap fields. This isn't desirable and could be fixed by defining a
new destination type with operations copied from the normal case,
other than the addition of the get_encap operation.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
---
 include/net/ip_fib.h     |   2 +
 include/net/route.h      |   3 +
 net/ipv4/fib_frontend.c  |   3 +
 net/ipv4/fib_lookup.h    |   2 +
 net/ipv4/fib_semantics.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv4/route.c         |  24 +++++++
 6 files changed, 211 insertions(+), 2 deletions(-)

Comments

Roopa Prabhu June 2, 2015, 4:01 p.m. UTC | #1
On 6/1/15, 9:46 AM, Robert Shearman wrote:
> Parse RTA_ENCAP attribute for one path and multipath routes. The encap
> length is stored in a newly added field to fib_nh, nh_encap_len,
> although this is added to a padding hole in the structure so that it
> doesn't increase the size at all. The encap data itself is stored at
> the end of the array of nexthops. Whilst this means that retrieval
> isn't optimal, especially if there are multiple nexthops, this avoids
> the memory cost of an extra pointer, as well as any potential change
> to the cache or instruction layout that could cause a performance
> impact.
>
> Currently, the dst structure allocated to represent the destination of
> the packet and used for retrieving the encap by the encap-type
> interface has been grown through the addition of the rt_encap_len and
> rt_encap fields. This isn't desirable and could be fixed by defining a
> new destination type with operations copied from the normal case,
> other than the addition of the get_encap operation.
>
> Signed-off-by: Robert Shearman <rshearma@brocade.com>
> ---
>   include/net/ip_fib.h     |   2 +
>   include/net/route.h      |   3 +
>   net/ipv4/fib_frontend.c  |   3 +
>   net/ipv4/fib_lookup.h    |   2 +
>   net/ipv4/fib_semantics.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++-
>   net/ipv4/route.c         |  24 +++++++
>   6 files changed, 211 insertions(+), 2 deletions(-)
>
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 54271ed0ed45..a06cec5eb3aa 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -44,6 +44,7 @@ struct fib_config {
>   	u32			fc_flow;
>   	u32			fc_nlflags;
>   	struct nl_info		fc_nlinfo;
> +	struct nlattr *fc_encap;
>    };
>   
>   struct fib_info;
> @@ -75,6 +76,7 @@ struct fib_nh {
>   	struct fib_info		*nh_parent;
>   	unsigned int		nh_flags;
>   	unsigned char		nh_scope;
> +	unsigned char		nh_encap_len;
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>   	int			nh_weight;
>   	int			nh_power;
> diff --git a/include/net/route.h b/include/net/route.h
> index fe22d03afb6a..e8b58914c4c1 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -64,6 +64,9 @@ struct rtable {
>   	/* Miscellaneous cached information */
>   	u32			rt_pmtu;
>   
> +	unsigned int		rt_encap_len;
> +	void			*rt_encap;
> +
>   	struct list_head	rt_uncached;
>   	struct uncached_list	*rt_uncached_list;
>   };
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 872494e6e6eb..aa538ab7e3b9 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -656,6 +656,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
>   		case RTA_TABLE:
>   			cfg->fc_table = nla_get_u32(attr);
>   			break;
> +		case RTA_ENCAP:
> +			cfg->fc_encap = attr;
> +			break;
>   		}
>   	}
>   
> diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
> index c6211ed60b03..003318c51ae8 100644
> --- a/net/ipv4/fib_lookup.h
> +++ b/net/ipv4/fib_lookup.h
> @@ -34,6 +34,8 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
>   		  unsigned int);
>   void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
>   	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
> +const void *fib_get_nh_encap(const struct fib_info *fi,
> +			     const struct fib_nh *nh);
>   
>   static inline void fib_result_assign(struct fib_result *res,
>   				     struct fib_info *fi)
> diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
> index 28ec3c1823bf..db466b636241 100644
> --- a/net/ipv4/fib_semantics.c
> +++ b/net/ipv4/fib_semantics.c
> @@ -257,6 +257,9 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
>   	const struct fib_nh *onh = ofi->fib_nh;
>   
>   	for_nexthops(fi) {
> +		const void *onh_encap = fib_get_nh_encap(fi, nh);
> +		const void *nh_encap = fib_get_nh_encap(fi, nh);
> +
>   		if (nh->nh_oif != onh->nh_oif ||
>   		    nh->nh_gw  != onh->nh_gw ||
>   		    nh->nh_scope != onh->nh_scope ||
> @@ -266,7 +269,10 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
>   #ifdef CONFIG_IP_ROUTE_CLASSID
>   		    nh->nh_tclassid != onh->nh_tclassid ||
>   #endif
> -		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
> +		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD) ||
> +		    nh->nh_encap_len != onh->nh_encap_len ||
> +		    memcmp(nh_encap, onh_encap, nh->nh_encap_len)
> +			)
>   			return -1;
>   		onh++;
>   	} endfor_nexthops(fi);
> @@ -374,6 +380,11 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
>   		/* may contain flow and gateway attribute */
>   		nhsize += 2 * nla_total_size(4);
>   
> +		for_nexthops(fi) {
> +			if (nh->nh_encap_len)
> +				nhsize += nla_total_size(nh->nh_encap_len);
> +		} endfor_nexthops(fi);
> +
>   		/* all nexthops are packed in a nested attribute */
>   		payload += nla_total_size(fi->fib_nhs * nhsize);
>   	}
> @@ -434,6 +445,83 @@ static int fib_detect_death(struct fib_info *fi, int order,
>   	return 1;
>   }
>   
> +static int fib_total_encap(struct fib_config *cfg)
> +{
> +	struct net *net = cfg->fc_nlinfo.nl_net;
> +	int total_encap_len = 0;
> +
> +	if (cfg->fc_mp) {
> +		int remaining = cfg->fc_mp_len;
> +		struct rtnexthop *rtnh = cfg->fc_mp;
> +
> +		while (rtnh_ok(rtnh, remaining)) {
> +			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
> +			int attrlen;
> +
> +			attrlen = rtnh_attrlen(rtnh);
> +			nla = nla_find(attrs, attrlen, RTA_ENCAP);
> +			if (nla) {
> +				struct net_device *dev;
> +				int len;
> +
> +				dev = __dev_get_by_index(net,
> +							 rtnh->rtnh_ifindex);
> +				if (!dev)
> +					return -EINVAL;
> +
> +				/* Determine space required */
> +				len = rtnl_parse_encap(dev, nla, NULL);
> +				if (len < 0)
> +					return len;
> +
> +				total_encap_len += len;
> +			}
> +
> +			rtnh = rtnh_next(rtnh, &remaining);
> +		}
> +	} else {
> +		if (cfg->fc_encap) {
> +			struct net_device *dev;
> +			int len;
> +
> +			dev = __dev_get_by_index(net, cfg->fc_oif);
> +			if (!dev)
> +				return -EINVAL;
> +
> +			/* Determine space required */
> +			len = rtnl_parse_encap(dev, cfg->fc_encap, NULL);
> +			if (len < 0)
> +				return len;
> +
> +			total_encap_len += len;
> +		}
> +	}
> +
> +	return total_encap_len;
> +}
we could avoid parsing and finding this device twice, if fib_nh just 
held a pointer to the encap_info
(or tunnel info) ?. And the encap_info/tun_info could be refcounted and 
shared between
nexthops ?. In my implementation i have just a pointer to parsed encap state
in fib_nh

> +
> +static void *__fib_get_nh_encap(const struct fib_info *fi,
> +				const struct fib_nh *the_nh)
> +{
> +	char *cur_encap_ptr = (char *)(fi->fib_nh + fi->fib_nhs);
> +
> +	for_nexthops(fi) {
> +		if (nh == the_nh)
> +			return cur_encap_ptr;
> +		cur_encap_ptr += nh->nh_encap_len;
> +	} endfor_nexthops(fi);
> +
> +	return NULL;
> +}
> +
> +const void *fib_get_nh_encap(const struct fib_info *fi, const struct fib_nh *nh)
> +{
> +	if (!nh->nh_encap_len)
> +		return NULL;
> +
> +	return __fib_get_nh_encap(fi, nh);
> +}
> +
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>   
>   static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
> @@ -475,6 +563,26 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
>   			if (nexthop_nh->nh_tclassid)
>   				fi->fib_net->ipv4.fib_num_tclassid_users++;
>   #endif
> +			nla = nla_find(attrs, attrlen, RTA_ENCAP);
> +			if (nla) {
> +				struct net *net = cfg->fc_nlinfo.nl_net;
> +				struct net_device *dev;
> +				void *nh_encap;
> +				int len;
> +
> +				dev = __dev_get_by_index(net,
> +							 nexthop_nh->nh_oif);
> +				if (!dev)
> +					return -EINVAL;
> +
> +				nh_encap = __fib_get_nh_encap(fi, nexthop_nh);
> +
> +				/* Fill in nh encap */
> +				len = rtnl_parse_encap(dev, nla, nh_encap);
> +				if (len < 0)
> +					return len;
> +				nexthop_nh->nh_encap_len = len;
> +			}
>   		}
>   
>   		rtnh = rtnh_next(rtnh, &remaining);
> @@ -495,6 +603,17 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
>   	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
>   		return 1;
>   
> +	if (cfg->fc_encap) {
> +		const void *nh_encap = fib_get_nh_encap(fi, fi->fib_nh);
> +
> +		if (!fi->fib_nh->nh_oif ||
> +		    rtnl_match_encap(fi->fib_nh->nh_dev,
> +				     cfg->fc_encap,
> +				     fi->fib_nh->nh_encap_len,
> +				     nh_encap))
> +			return 1;
> +	}
> +
>   	if (cfg->fc_oif || cfg->fc_gw) {
>   		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
>   		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
> @@ -530,6 +649,17 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
>   			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
>   				return 1;
>   #endif
> +			nla = nla_find(attrs, attrlen, RTA_ENCAP);
> +			if (nla) {
> +				const void *nh_encap = fib_get_nh_encap(fi, nh);
> +
> +				if (!nh->nh_oif ||
> +				    rtnl_match_encap(nh->nh_dev,
> +						     cfg->fc_encap,
> +						     nh->nh_encap_len,
> +						     nh_encap))
> +					return 1;
> +			}
>   		}
>   
>   		rtnh = rtnh_next(rtnh, &remaining);
> @@ -760,6 +890,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
>   	struct fib_info *ofi;
>   	int nhs = 1;
>   	struct net *net = cfg->fc_nlinfo.nl_net;
> +	int encap_len;
>   
>   	if (cfg->fc_type > RTN_MAX)
>   		goto err_inval;
> @@ -798,7 +929,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
>   			goto failure;
>   	}
>   
> -	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
> +	encap_len = fib_total_encap(cfg);
> +	if (encap_len < 0) {
> +		err = encap_len;
> +		goto failure;
> +	}
> +
> +	fi = kzalloc(sizeof(*fi) + nhs * sizeof(struct fib_nh) + encap_len,
> +		     GFP_KERNEL);
>   	if (!fi)
>   		goto failure;
>   	fib_info_cnt++;
> @@ -886,6 +1024,26 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>   		nh->nh_weight = 1;
>   #endif
> +		if (cfg->fc_encap) {
> +			struct net_device *dev;
> +			void *nh_encap;
> +			int len;
> +
> +			err = -EINVAL;
> +			dev = __dev_get_by_index(net, nh->nh_oif);
> +			if (!dev)
> +				goto failure;
> +
> +			nh_encap = __fib_get_nh_encap(fi, nh);
> +
> +			/* Fill in nh encap */
> +			len = rtnl_parse_encap(dev, cfg->fc_encap, nh_encap);
> +			if (len < 0 || len > sizeof(nh->nh_encap_len) * 8) {
> +				err = len;
> +				goto failure;
> +			}
> +			nh->nh_encap_len = len;
> +		}
>   	}
>   
>   	if (fib_props[cfg->fc_type].error) {
> @@ -1023,6 +1181,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
>   		goto nla_put_failure;
>   	if (fi->fib_nhs == 1) {
> +		const void *nh_encap;
> +
>   		if (fi->fib_nh->nh_gw &&
>   		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
>   			goto nla_put_failure;
> @@ -1034,6 +1194,12 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
>   			goto nla_put_failure;
>   #endif
> +
> +		nh_encap = fib_get_nh_encap(fi, &fi->fib_nh[0]);
> +		if (nh_encap && rtnl_fill_encap(fi->fib_nh[0].nh_dev, skb,
> +						fi->fib_nh[0].nh_encap_len,
> +						nh_encap))
> +			goto nla_put_failure;
>   	}
>   #ifdef CONFIG_IP_ROUTE_MULTIPATH
>   	if (fi->fib_nhs > 1) {
> @@ -1045,6 +1211,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   			goto nla_put_failure;
>   
>   		for_nexthops(fi) {
> +			const void *nh_encap;
> +
>   			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
>   			if (!rtnh)
>   				goto nla_put_failure;
> @@ -1061,6 +1229,13 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
>   				goto nla_put_failure;
>   #endif
> +
> +			nh_encap = fib_get_nh_encap(fi, nh);
> +			if (nh_encap && rtnl_fill_encap(nh->nh_dev, skb,
> +							nh->nh_encap_len,
> +							nh_encap))
> +				goto nla_put_failure;
> +
>   			/* length of rtnetlink header + attributes */
>   			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
>   		} endfor_nexthops(fi);
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index f6055984c307..d52fa3d168a5 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -110,6 +110,8 @@
>   #endif
>   #include <net/secure_seq.h>
>   
> +#include "fib_lookup.h"
> +
>   #define RT_FL_TOS(oldflp4) \
>   	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
>   
> @@ -138,6 +140,8 @@ static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
>   					   struct sk_buff *skb, u32 mtu);
>   static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
>   					struct sk_buff *skb);
> +static int		ipv4_dst_get_encap(const struct dst_entry *dst,
> +					   const void **encap);
>   static void		ipv4_dst_destroy(struct dst_entry *dst);
>   
>   static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
> @@ -163,6 +167,7 @@ static struct dst_ops ipv4_dst_ops = {
>   	.redirect =		ip_do_redirect,
>   	.local_out =		__ip_local_out,
>   	.neigh_lookup =		ipv4_neigh_lookup,
> +	.get_encap =		ipv4_dst_get_encap,
>   };
>   
>   #define ECN_OR_COST(class)	TC_PRIO_##class
> @@ -1145,6 +1150,15 @@ static void ipv4_link_failure(struct sk_buff *skb)
>   		dst_set_expires(&rt->dst, 0);
>   }
>   
> +static int ipv4_dst_get_encap(const struct dst_entry *dst,
> +			      const void **encap)
> +{
> +	const struct rtable *rt = (const struct rtable *)dst;
> +
> +	*encap = rt->rt_encap;
> +	return rt->rt_encap_len;
> +}
> +
>   static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
>   {
>   	pr_debug("%s: %pI4 -> %pI4, %s\n",
> @@ -1394,6 +1408,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
>   
>   	if (fi) {
>   		struct fib_nh *nh = &FIB_RES_NH(*res);
> +		const void *nh_encap;
>   
>   		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
>   			rt->rt_gateway = nh->nh_gw;
> @@ -1403,6 +1418,15 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
>   #ifdef CONFIG_IP_ROUTE_CLASSID
>   		rt->dst.tclassid = nh->nh_tclassid;
>   #endif
> +
> +		nh_encap = fib_get_nh_encap(fi, nh);
> +		if (unlikely(nh_encap)) {
> +			rt->rt_encap = kmemdup(nh_encap, nh->nh_encap_len,
> +					       GFP_KERNEL);
> +			if (rt->rt_encap)
> +				rt->rt_encap_len = nh->nh_encap_len;
> +		}
> +

And..., you could make the rtable point to the same encap info.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robert Shearman June 2, 2015, 4:35 p.m. UTC | #2
On 02/06/15 17:01, roopa wrote:
> On 6/1/15, 9:46 AM, Robert Shearman wrote:
>> Parse RTA_ENCAP attribute for one path and multipath routes. The encap
>> length is stored in a newly added field to fib_nh, nh_encap_len,
>> although this is added to a padding hole in the structure so that it
>> doesn't increase the size at all. The encap data itself is stored at
>> the end of the array of nexthops. Whilst this means that retrieval
>> isn't optimal, especially if there are multiple nexthops, this avoids
>> the memory cost of an extra pointer, as well as any potential change
>> to the cache or instruction layout that could cause a performance
>> impact.
>>
>> Currently, the dst structure allocated to represent the destination of
>> the packet and used for retrieving the encap by the encap-type
>> interface has been grown through the addition of the rt_encap_len and
>> rt_encap fields. This isn't desirable and could be fixed by defining a
>> new destination type with operations copied from the normal case,
>> other than the addition of the get_encap operation.
>>
>> Signed-off-by: Robert Shearman <rshearma@brocade.com>
...
>> @@ -434,6 +445,83 @@ static int fib_detect_death(struct fib_info *fi,
>> int order,
>>       return 1;
>>   }
>> +static int fib_total_encap(struct fib_config *cfg)
>> +{
>> +    struct net *net = cfg->fc_nlinfo.nl_net;
>> +    int total_encap_len = 0;
>> +
>> +    if (cfg->fc_mp) {
>> +        int remaining = cfg->fc_mp_len;
>> +        struct rtnexthop *rtnh = cfg->fc_mp;
>> +
>> +        while (rtnh_ok(rtnh, remaining)) {
>> +            struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
>> +            int attrlen;
>> +
>> +            attrlen = rtnh_attrlen(rtnh);
>> +            nla = nla_find(attrs, attrlen, RTA_ENCAP);
>> +            if (nla) {
>> +                struct net_device *dev;
>> +                int len;
>> +
>> +                dev = __dev_get_by_index(net,
>> +                             rtnh->rtnh_ifindex);
>> +                if (!dev)
>> +                    return -EINVAL;
>> +
>> +                /* Determine space required */
>> +                len = rtnl_parse_encap(dev, nla, NULL);
>> +                if (len < 0)
>> +                    return len;
>> +
>> +                total_encap_len += len;
>> +            }
>> +
>> +            rtnh = rtnh_next(rtnh, &remaining);
>> +        }
>> +    } else {
>> +        if (cfg->fc_encap) {
>> +            struct net_device *dev;
>> +            int len;
>> +
>> +            dev = __dev_get_by_index(net, cfg->fc_oif);
>> +            if (!dev)
>> +                return -EINVAL;
>> +
>> +            /* Determine space required */
>> +            len = rtnl_parse_encap(dev, cfg->fc_encap, NULL);
>> +            if (len < 0)
>> +                return len;
>> +
>> +            total_encap_len += len;
>> +        }
>> +    }
>> +
>> +    return total_encap_len;
>> +}
> we could avoid parsing and finding this device twice, if fib_nh just
> held a pointer to the encap_info
> (or tunnel info) ?. And the encap_info/tun_info could be refcounted and
> shared between
> nexthops ?. In my implementation i have just a pointer to parsed encap
> state
> in fib_nh

Right - I took the approach here to avoid any extra memory use if encap 
isn't in use for the nexthop/route, and to avoid any potential 
performance penalty caused by extra cache misses. It would certainly 
make things simpler if those weren't concerns. I'd appreciate input from 
the community on this.

>> @@ -1403,6 +1418,15 @@ static void rt_set_nexthop(struct rtable *rt,
>> __be32 daddr,
>>   #ifdef CONFIG_IP_ROUTE_CLASSID
>>           rt->dst.tclassid = nh->nh_tclassid;
>>   #endif
>> +
>> +        nh_encap = fib_get_nh_encap(fi, nh);
>> +        if (unlikely(nh_encap)) {
>> +            rt->rt_encap = kmemdup(nh_encap, nh->nh_encap_len,
>> +                           GFP_KERNEL);
>> +            if (rt->rt_encap)
>> +                rt->rt_encap_len = nh->nh_encap_len;
>> +        }
>> +
>
> And..., you could make the rtable point to the same encap info.

Ack.

Thanks,
Rob
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 54271ed0ed45..a06cec5eb3aa 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -44,6 +44,7 @@  struct fib_config {
 	u32			fc_flow;
 	u32			fc_nlflags;
 	struct nl_info		fc_nlinfo;
+	struct nlattr *fc_encap;
  };
 
 struct fib_info;
@@ -75,6 +76,7 @@  struct fib_nh {
 	struct fib_info		*nh_parent;
 	unsigned int		nh_flags;
 	unsigned char		nh_scope;
+	unsigned char		nh_encap_len;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int			nh_weight;
 	int			nh_power;
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03afb6a..e8b58914c4c1 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -64,6 +64,9 @@  struct rtable {
 	/* Miscellaneous cached information */
 	u32			rt_pmtu;
 
+	unsigned int		rt_encap_len;
+	void			*rt_encap;
+
 	struct list_head	rt_uncached;
 	struct uncached_list	*rt_uncached_list;
 };
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 872494e6e6eb..aa538ab7e3b9 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -656,6 +656,9 @@  static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 		case RTA_TABLE:
 			cfg->fc_table = nla_get_u32(attr);
 			break;
+		case RTA_ENCAP:
+			cfg->fc_encap = attr;
+			break;
 		}
 	}
 
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c6211ed60b03..003318c51ae8 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -34,6 +34,8 @@  int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
 		  unsigned int);
 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
 	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+const void *fib_get_nh_encap(const struct fib_info *fi,
+			     const struct fib_nh *nh);
 
 static inline void fib_result_assign(struct fib_result *res,
 				     struct fib_info *fi)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 28ec3c1823bf..db466b636241 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -257,6 +257,9 @@  static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 	const struct fib_nh *onh = ofi->fib_nh;
 
 	for_nexthops(fi) {
+		const void *onh_encap = fib_get_nh_encap(fi, nh);
+		const void *nh_encap = fib_get_nh_encap(fi, nh);
+
 		if (nh->nh_oif != onh->nh_oif ||
 		    nh->nh_gw  != onh->nh_gw ||
 		    nh->nh_scope != onh->nh_scope ||
@@ -266,7 +269,10 @@  static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
-		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
+		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD) ||
+		    nh->nh_encap_len != onh->nh_encap_len ||
+		    memcmp(nh_encap, onh_encap, nh->nh_encap_len)
+			)
 			return -1;
 		onh++;
 	} endfor_nexthops(fi);
@@ -374,6 +380,11 @@  static inline size_t fib_nlmsg_size(struct fib_info *fi)
 		/* may contain flow and gateway attribute */
 		nhsize += 2 * nla_total_size(4);
 
+		for_nexthops(fi) {
+			if (nh->nh_encap_len)
+				nhsize += nla_total_size(nh->nh_encap_len);
+		} endfor_nexthops(fi);
+
 		/* all nexthops are packed in a nested attribute */
 		payload += nla_total_size(fi->fib_nhs * nhsize);
 	}
@@ -434,6 +445,83 @@  static int fib_detect_death(struct fib_info *fi, int order,
 	return 1;
 }
 
+static int fib_total_encap(struct fib_config *cfg)
+{
+	struct net *net = cfg->fc_nlinfo.nl_net;
+	int total_encap_len = 0;
+
+	if (cfg->fc_mp) {
+		int remaining = cfg->fc_mp_len;
+		struct rtnexthop *rtnh = cfg->fc_mp;
+
+		while (rtnh_ok(rtnh, remaining)) {
+			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+			int attrlen;
+
+			attrlen = rtnh_attrlen(rtnh);
+			nla = nla_find(attrs, attrlen, RTA_ENCAP);
+			if (nla) {
+				struct net_device *dev;
+				int len;
+
+				dev = __dev_get_by_index(net,
+							 rtnh->rtnh_ifindex);
+				if (!dev)
+					return -EINVAL;
+
+				/* Determine space required */
+				len = rtnl_parse_encap(dev, nla, NULL);
+				if (len < 0)
+					return len;
+
+				total_encap_len += len;
+			}
+
+			rtnh = rtnh_next(rtnh, &remaining);
+		}
+	} else {
+		if (cfg->fc_encap) {
+			struct net_device *dev;
+			int len;
+
+			dev = __dev_get_by_index(net, cfg->fc_oif);
+			if (!dev)
+				return -EINVAL;
+
+			/* Determine space required */
+			len = rtnl_parse_encap(dev, cfg->fc_encap, NULL);
+			if (len < 0)
+				return len;
+
+			total_encap_len += len;
+		}
+	}
+
+	return total_encap_len;
+}
+
+static void *__fib_get_nh_encap(const struct fib_info *fi,
+				const struct fib_nh *the_nh)
+{
+	char *cur_encap_ptr = (char *)(fi->fib_nh + fi->fib_nhs);
+
+	for_nexthops(fi) {
+		if (nh == the_nh)
+			return cur_encap_ptr;
+		cur_encap_ptr += nh->nh_encap_len;
+	} endfor_nexthops(fi);
+
+	return NULL;
+}
+
+const void *fib_get_nh_encap(const struct fib_info *fi, const struct fib_nh *nh)
+{
+	if (!nh->nh_encap_len)
+		return NULL;
+
+	return __fib_get_nh_encap(fi, nh);
+}
+
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
@@ -475,6 +563,26 @@  static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			if (nexthop_nh->nh_tclassid)
 				fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
+			nla = nla_find(attrs, attrlen, RTA_ENCAP);
+			if (nla) {
+				struct net *net = cfg->fc_nlinfo.nl_net;
+				struct net_device *dev;
+				void *nh_encap;
+				int len;
+
+				dev = __dev_get_by_index(net,
+							 nexthop_nh->nh_oif);
+				if (!dev)
+					return -EINVAL;
+
+				nh_encap = __fib_get_nh_encap(fi, nexthop_nh);
+
+				/* Fill in nh encap */
+				len = rtnl_parse_encap(dev, nla, nh_encap);
+				if (len < 0)
+					return len;
+				nexthop_nh->nh_encap_len = len;
+			}
 		}
 
 		rtnh = rtnh_next(rtnh, &remaining);
@@ -495,6 +603,17 @@  int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
 		return 1;
 
+	if (cfg->fc_encap) {
+		const void *nh_encap = fib_get_nh_encap(fi, fi->fib_nh);
+
+		if (!fi->fib_nh->nh_oif ||
+		    rtnl_match_encap(fi->fib_nh->nh_dev,
+				     cfg->fc_encap,
+				     fi->fib_nh->nh_encap_len,
+				     nh_encap))
+			return 1;
+	}
+
 	if (cfg->fc_oif || cfg->fc_gw) {
 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
@@ -530,6 +649,17 @@  int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
 				return 1;
 #endif
+			nla = nla_find(attrs, attrlen, RTA_ENCAP);
+			if (nla) {
+				const void *nh_encap = fib_get_nh_encap(fi, nh);
+
+				if (!nh->nh_oif ||
+				    rtnl_match_encap(nh->nh_dev,
+						     cfg->fc_encap,
+						     nh->nh_encap_len,
+						     nh_encap))
+					return 1;
+			}
 		}
 
 		rtnh = rtnh_next(rtnh, &remaining);
@@ -760,6 +890,7 @@  struct fib_info *fib_create_info(struct fib_config *cfg)
 	struct fib_info *ofi;
 	int nhs = 1;
 	struct net *net = cfg->fc_nlinfo.nl_net;
+	int encap_len;
 
 	if (cfg->fc_type > RTN_MAX)
 		goto err_inval;
@@ -798,7 +929,14 @@  struct fib_info *fib_create_info(struct fib_config *cfg)
 			goto failure;
 	}
 
-	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+	encap_len = fib_total_encap(cfg);
+	if (encap_len < 0) {
+		err = encap_len;
+		goto failure;
+	}
+
+	fi = kzalloc(sizeof(*fi) + nhs * sizeof(struct fib_nh) + encap_len,
+		     GFP_KERNEL);
 	if (!fi)
 		goto failure;
 	fib_info_cnt++;
@@ -886,6 +1024,26 @@  struct fib_info *fib_create_info(struct fib_config *cfg)
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 		nh->nh_weight = 1;
 #endif
+		if (cfg->fc_encap) {
+			struct net_device *dev;
+			void *nh_encap;
+			int len;
+
+			err = -EINVAL;
+			dev = __dev_get_by_index(net, nh->nh_oif);
+			if (!dev)
+				goto failure;
+
+			nh_encap = __fib_get_nh_encap(fi, nh);
+
+			/* Fill in nh encap */
+			len = rtnl_parse_encap(dev, cfg->fc_encap, nh_encap);
+			if (len < 0 || len > sizeof(nh->nh_encap_len) * 8) {
+				err = len;
+				goto failure;
+			}
+			nh->nh_encap_len = len;
+		}
 	}
 
 	if (fib_props[cfg->fc_type].error) {
@@ -1023,6 +1181,8 @@  int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
 		goto nla_put_failure;
 	if (fi->fib_nhs == 1) {
+		const void *nh_encap;
+
 		if (fi->fib_nh->nh_gw &&
 		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
 			goto nla_put_failure;
@@ -1034,6 +1194,12 @@  int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
 			goto nla_put_failure;
 #endif
+
+		nh_encap = fib_get_nh_encap(fi, &fi->fib_nh[0]);
+		if (nh_encap && rtnl_fill_encap(fi->fib_nh[0].nh_dev, skb,
+						fi->fib_nh[0].nh_encap_len,
+						nh_encap))
+			goto nla_put_failure;
 	}
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (fi->fib_nhs > 1) {
@@ -1045,6 +1211,8 @@  int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			goto nla_put_failure;
 
 		for_nexthops(fi) {
+			const void *nh_encap;
+
 			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
 			if (!rtnh)
 				goto nla_put_failure;
@@ -1061,6 +1229,13 @@  int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
 				goto nla_put_failure;
 #endif
+
+			nh_encap = fib_get_nh_encap(fi, nh);
+			if (nh_encap && rtnl_fill_encap(nh->nh_dev, skb,
+							nh->nh_encap_len,
+							nh_encap))
+				goto nla_put_failure;
+
 			/* length of rtnetlink header + attributes */
 			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
 		} endfor_nexthops(fi);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f6055984c307..d52fa3d168a5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -110,6 +110,8 @@ 
 #endif
 #include <net/secure_seq.h>
 
+#include "fib_lookup.h"
+
 #define RT_FL_TOS(oldflp4) \
 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 
@@ -138,6 +140,8 @@  static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 					   struct sk_buff *skb, u32 mtu);
 static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 					struct sk_buff *skb);
+static int		ipv4_dst_get_encap(const struct dst_entry *dst,
+					   const void **encap);
 static void		ipv4_dst_destroy(struct dst_entry *dst);
 
 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
@@ -163,6 +167,7 @@  static struct dst_ops ipv4_dst_ops = {
 	.redirect =		ip_do_redirect,
 	.local_out =		__ip_local_out,
 	.neigh_lookup =		ipv4_neigh_lookup,
+	.get_encap =		ipv4_dst_get_encap,
 };
 
 #define ECN_OR_COST(class)	TC_PRIO_##class
@@ -1145,6 +1150,15 @@  static void ipv4_link_failure(struct sk_buff *skb)
 		dst_set_expires(&rt->dst, 0);
 }
 
+static int ipv4_dst_get_encap(const struct dst_entry *dst,
+			      const void **encap)
+{
+	const struct rtable *rt = (const struct rtable *)dst;
+
+	*encap = rt->rt_encap;
+	return rt->rt_encap_len;
+}
+
 static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
 {
 	pr_debug("%s: %pI4 -> %pI4, %s\n",
@@ -1394,6 +1408,7 @@  static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 
 	if (fi) {
 		struct fib_nh *nh = &FIB_RES_NH(*res);
+		const void *nh_encap;
 
 		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
 			rt->rt_gateway = nh->nh_gw;
@@ -1403,6 +1418,15 @@  static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
+
+		nh_encap = fib_get_nh_encap(fi, nh);
+		if (unlikely(nh_encap)) {
+			rt->rt_encap = kmemdup(nh_encap, nh->nh_encap_len,
+					       GFP_KERNEL);
+			if (rt->rt_encap)
+				rt->rt_encap_len = nh->nh_encap_len;
+		}
+
 		if (unlikely(fnhe))
 			cached = rt_bind_exception(rt, fnhe, daddr);
 		else if (!(rt->dst.flags & DST_NOCACHE))