diff mbox

[net-next,v2,1/2] mpls: multipath support

Message ID 1444157209-12518-2-git-send-email-roopa@cumulusnetworks.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Roopa Prabhu Oct. 6, 2015, 6:46 p.m. UTC
From: Roopa Prabhu <roopa@cumulusnetworks.com>

This patch adds support for MPLS multipath routes.

Includes following changes to support multipath:
- splits struct mpls_route into 'struct mpls_route + struct mpls_nh'

- 'struct mpls_nh' represents a mpls nexthop label forwarding entry

- moves mpls route and nexthop structures into internal.h

- A mpls_route can point to multiple mpls_nh structs

- the nexthops are maintained as a list

- In the process of restructuring, this patch also consistently changes all
label count fields to u8 (the label values themselves remain u32)

- Adds support to parse/fill RTA_MULTIPATH netlink attribute for
multipath routes similar to ipv4/v6 fib

- In this patch, the multipath route nexthop selection algorithm
is a simple round robin picked up from ipv4 fib code and is replaced by
a hash based algorithm from Robert Shearman in the next patch

- mpls_route_update cleanup: remove 'dev' handling in mpls_route_update.
Although mpls_route_update was implemented to update based on dev, it was never
used that way. The dev handling also gets tricky with multiple nexthops, because
a route cannot be matched against any single nexthop's dev. So, this patch
removes the unused 'dev' handling in mpls_route_update.

Example:

$ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \
                nexthop as 700 via inet 10.1.1.6 dev swp2 \
                nexthop as 800 via inet 40.1.1.2 dev swp3

$ip  -f mpls route show
100
        nexthop as to 200 via inet 10.1.1.2  dev swp1
        nexthop as to 700 via inet 10.1.1.6  dev swp2
        nexthop as to 800 via inet 40.1.1.2  dev swp3

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 include/net/mpls_iptunnel.h |   2 +-
 net/mpls/af_mpls.c          | 627 +++++++++++++++++++++++++++++++++-----------
 net/mpls/internal.h         |  43 ++-
 3 files changed, 516 insertions(+), 156 deletions(-)

Comments

Eric W. Biederman Oct. 6, 2015, 7:44 p.m. UTC | #1
Roopa Prabhu <roopa@cumulusnetworks.com> writes:

> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> This patch adds support for MPLS multipath routes.
>
> Includes following changes to support multipath:
> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh'
>
> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry
>
> - moves mpls route and nexthop structures into internal.h
>
> - A mpls_route can point to multiple mpls_nh structs
>
> - the nexthops are maintained as a list

So I am not certain I like nexthops being a list.  In the practical case
introducing this list guarantees that everyone will see at least an
extra cache line miss in the forwarding path.

In the more abstract sense, a list is the wrong data structure.  If the
list is so short that we can afford to walk it, an array is a better data
structure.  If we need enough entries to make the memory consumption
of an array a concern, we want some kind of hash table or tree data
structure, because a list will be too long in that case.

So can we please not use a list?

I expect we can simplify the data structures by noting that rt_via must
be an Ethernet MAC today, so 6 bytes are enough, and 8 bytes give us
a bit extra and align things nicely.

Also I know it goes away in the next patch but a spinlock taken for
every transit through the forwarding path really bugs me.

Eric

> - In the process of restructuring, this patch also consistently changes all
> labels to u8
>
> - Adds support to parse/fill RTA_MULTIPATH netlink attribute for
> multipath routes similar to ipv4/v6 fib
>
> - In this patch, the multipath route nexthop selection algorithm
> is a simple round robin picked up from ipv4 fib code and is replaced by
> a hash based algorithm from Robert Shearman in the next patch
>
> - mpls_route_update cleanup: remove 'dev' handling in mpls_route_update.
> mpls_route_update though implemented to update based on dev, it was never
> used that way. And the dev handling gets tricky with multiple nexthops. Cannot
> match against any single nexthops dev. So, this patch removes the unused
> 'dev' handling in mpls_route_update.

>
> Example:
>
> $ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \
>                 nexthop as 700 via inet 10.1.1.6 dev swp2 \
>                 nexthop as 800 via inet 40.1.1.2 dev swp3
>
> $ip  -f mpls route show
> 100
>         nexthop as to 200 via inet 10.1.1.2  dev swp1
>         nexthop as to 700 via inet 10.1.1.6  dev swp2
>         nexthop as to 800 via inet 40.1.1.2  dev swp3
>
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> ---
>  include/net/mpls_iptunnel.h |   2 +-
>  net/mpls/af_mpls.c          | 627 +++++++++++++++++++++++++++++++++-----------
>  net/mpls/internal.h         |  43 ++-
>  3 files changed, 516 insertions(+), 156 deletions(-)
>
> diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
> index 4757997..179253f 100644
> --- a/include/net/mpls_iptunnel.h
> +++ b/include/net/mpls_iptunnel.h
> @@ -18,7 +18,7 @@
>  
>  struct mpls_iptunnel_encap {
>  	u32	label[MAX_NEW_LABELS];
> -	u32	labels;
> +	u8	labels;
>  };
>  
>  static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate)
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index 8c5707d..ae9e153 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -19,39 +19,12 @@
>  #include <net/ipv6.h>
>  #include <net/addrconf.h>
>  #endif
> +#include <net/nexthop.h>
>  #include "internal.h"
>  
> -#define LABEL_NOT_SPECIFIED (1<<20)
> -#define MAX_NEW_LABELS 2
> -
> -/* This maximum ha length copied from the definition of struct neighbour */
> -#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
> -
> -enum mpls_payload_type {
> -	MPT_UNSPEC, /* IPv4 or IPv6 */
> -	MPT_IPV4 = 4,
> -	MPT_IPV6 = 6,
> -
> -	/* Other types not implemented:
> -	 *  - Pseudo-wire with or without control word (RFC4385)
> -	 *  - GAL (RFC5586)
> -	 */
> -};
> -
> -struct mpls_route { /* next hop label forwarding entry */
> -	struct net_device __rcu *rt_dev;
> -	struct rcu_head		rt_rcu;
> -	u32			rt_label[MAX_NEW_LABELS];
> -	u8			rt_protocol; /* routing protocol that set this entry */
> -	u8                      rt_payload_type;
> -	u8			rt_labels;
> -	u8			rt_via_alen;
> -	u8			rt_via_table;
> -	u8			rt_via[0];
> -};
> -
>  static int zero = 0;
>  static int label_limit = (1 << 20) - 1;
> +static DEFINE_SPINLOCK(mpls_multipath_lock);
>  
>  static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
>  		       struct nlmsghdr *nlh, struct net *net, u32 portid,
> @@ -80,10 +53,10 @@ bool mpls_output_possible(const struct net_device *dev)
>  }
>  EXPORT_SYMBOL_GPL(mpls_output_possible);
>  
> -static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
> +static unsigned int mpls_nh_header_size(const struct mpls_nh *nh)
>  {
>  	/* The size of the layer 2.5 labels to be added for this route */
> -	return rt->rt_labels * sizeof(struct mpls_shim_hdr);
> +	return nh->nh_labels * sizeof(struct mpls_shim_hdr);
>  }
>  
>  unsigned int mpls_dev_mtu(const struct net_device *dev)
> @@ -105,8 +78,58 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
>  }
>  EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
>  
> -static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
> -			struct mpls_entry_decoded dec)
> +/* This is a cut/copy/modify from fib_select_multipath */
> +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt)
> +{
> +	struct mpls_nh *nh;
> +	struct mpls_nh *ret_nh;
> +	int nhsel = 0;
> +	int w;
> +
> +	spin_lock_bh(&mpls_multipath_lock);
> +	ret_nh = list_first_entry_or_null(&rt->rt_nhs, struct mpls_nh,
> +					  nh_next);
> +	if (rt->rt_power <= 0) {
> +		int power = 0;
> +
> +		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +			power += nh->nh_weight;
> +			nh->nh_power = nh->nh_weight;
> +		}
> +		rt->rt_power = power;
> +		if (power <= 0) {
> +			spin_unlock_bh(&mpls_multipath_lock);
> +			/* Race condition: route has just become dead. */
> +			return ret_nh;
> +		}
> +	}
> +
> +	/* w should be random number [0..rt->rt_power-1],
> +	 * it is pretty bad approximation.
> +	 */
> +	w = jiffies % rt->rt_power;
> +
> +	list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +		if (nh->nh_power) {
> +			w -= nh->nh_power;
> +			if (w <= 0) {
> +				nh->nh_power--;
> +				rt->rt_power--;
> +				ret_nh = nh;
> +				spin_unlock_bh(&mpls_multipath_lock);
> +				return ret_nh;
> +			}
> +		}
> +		nhsel++;
> +	}
> +
> +	/* Race condition: route has just become dead. */
> +	spin_unlock_bh(&mpls_multipath_lock);
> +	return ret_nh;
> +}
> +
> +static bool mpls_egress(struct mpls_route *rt, struct mpls_nh *nh,
> +			struct sk_buff *skb, struct mpls_entry_decoded dec)
>  {
>  	enum mpls_payload_type payload_type;
>  	bool success = false;
> @@ -159,6 +182,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>  	struct net *net = dev_net(dev);
>  	struct mpls_shim_hdr *hdr;
>  	struct mpls_route *rt;
> +	struct mpls_nh *nh;
>  	struct mpls_entry_decoded dec;
>  	struct net_device *out_dev;
>  	struct mpls_dev *mdev;
> @@ -196,9 +220,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>  	if (!rt)
>  		goto drop;
>  
> +	nh = mpls_select_multipath(rt);
> +	if (!nh)
> +		goto drop;
> +
>  	/* Find the output device */
> -	out_dev = rcu_dereference(rt->rt_dev);
> -	if (!mpls_output_possible(out_dev))
> +	out_dev = rcu_dereference(nh->nh_dev);
> +	if (!out_dev || !mpls_output_possible(out_dev))
>  		goto drop;
>  
>  	if (skb_warn_if_lro(skb))
> @@ -212,7 +240,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>  	dec.ttl -= 1;
>  
>  	/* Verify the destination can hold the packet */
> -	new_header_size = mpls_rt_header_size(rt);
> +	new_header_size = mpls_nh_header_size(nh);
>  	mtu = mpls_dev_mtu(out_dev);
>  	if (mpls_pkt_too_big(skb, mtu - new_header_size))
>  		goto drop;
> @@ -230,7 +258,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>  
>  	if (unlikely(!new_header_size && dec.bos)) {
>  		/* Penultimate hop popping */
> -		if (!mpls_egress(rt, skb, dec))
> +		if (!mpls_egress(rt, nh, skb, dec))
>  			goto drop;
>  	} else {
>  		bool bos;
> @@ -240,13 +268,14 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>  		/* Push the new labels */
>  		hdr = mpls_hdr(skb);
>  		bos = dec.bos;
> -		for (i = rt->rt_labels - 1; i >= 0; i--) {
> -			hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos);
> +		for (i = nh->nh_labels - 1; i >= 0; i--) {
> +			hdr[i] = mpls_entry_encode(nh->nh_label[i],
> +						   dec.ttl, 0, bos);
>  			bos = false;
>  		}
>  	}
>  
> -	err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb);
> +	err = neigh_xmit(nh->nh_via_table, out_dev, nh->nh_via, skb);
>  	if (err)
>  		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
>  				    __func__, err);
> @@ -270,31 +299,43 @@ static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = {
>  struct mpls_route_config {
>  	u32			rc_protocol;
>  	u32			rc_ifindex;
> -	u16			rc_via_table;
> -	u16			rc_via_alen;
> +	u8			rc_via_table;
> +	u8			rc_via_alen;
>  	u8			rc_via[MAX_VIA_ALEN];
> +	u8			rc_output_labels;
>  	u32			rc_label;
> -	u32			rc_output_labels;
>  	u32			rc_output_label[MAX_NEW_LABELS];
>  	u32			rc_nlflags;
>  	enum mpls_payload_type	rc_payload_type;
>  	struct nl_info		rc_nlinfo;
> +	struct rtnexthop	*rc_mp;
> +	int			rc_mp_len;
>  };
>  
> -static struct mpls_route *mpls_rt_alloc(size_t alen)
> +static struct mpls_route *mpls_rt_alloc(int num_nh)
>  {
>  	struct mpls_route *rt;
>  
> -	rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
> -	if (rt)
> -		rt->rt_via_alen = alen;
> +	rt = kzalloc(sizeof(*rt), GFP_KERNEL);
> +	if (rt) {
> +		rt->rt_nhn = num_nh;
> +		INIT_LIST_HEAD(&rt->rt_nhs);
> +	}
> +
>  	return rt;
>  }
>  
>  static void mpls_rt_free(struct mpls_route *rt)
>  {
> -	if (rt)
> +	struct mpls_nh *nh, *nh_safe;
> +
> +	if (rt) {
> +		list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) {
> +			list_del(&nh->nh_next);
> +			kfree(nh);
> +		}
>  		kfree_rcu(rt, rt_rcu);
> +	}
>  }
>  
>  static void mpls_notify_route(struct net *net, unsigned index,
> @@ -312,25 +353,22 @@ static void mpls_notify_route(struct net *net, unsigned index,
>  }
>  
>  static void mpls_route_update(struct net *net, unsigned index,
> -			      struct net_device *dev, struct mpls_route *new,
> +			      struct mpls_route *new,
>  			      const struct nl_info *info)
>  {
>  	struct mpls_route __rcu **platform_label;
> -	struct mpls_route *rt, *old = NULL;
> +	struct mpls_route *rt;
>  
>  	ASSERT_RTNL();
>  
>  	platform_label = rtnl_dereference(net->mpls.platform_label);
>  	rt = rtnl_dereference(platform_label[index]);
> -	if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) {
> -		rcu_assign_pointer(platform_label[index], new);
> -		old = rt;
> -	}
> +	rcu_assign_pointer(platform_label[index], new);
>  
> -	mpls_notify_route(net, index, old, new, info);
> +	mpls_notify_route(net, index, rt, new, info);
>  
>  	/* If we removed a route free it now */
> -	mpls_rt_free(old);
> +	mpls_rt_free(rt);
>  }
>  
>  static unsigned find_free_label(struct net *net)
> @@ -406,23 +444,23 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr)
>  #endif
>  
>  static struct net_device *find_outdev(struct net *net,
> -				      struct mpls_route_config *cfg)
> +				      struct mpls_nh *nh, int oif)
>  {
>  	struct net_device *dev = NULL;
>  
> -	if (!cfg->rc_ifindex) {
> -		switch (cfg->rc_via_table) {
> +	if (!oif) {
> +		switch (nh->nh_via_table) {
>  		case NEIGH_ARP_TABLE:
> -			dev = inet_fib_lookup_dev(net, cfg->rc_via);
> +			dev = inet_fib_lookup_dev(net, nh->nh_via);
>  			break;
>  		case NEIGH_ND_TABLE:
> -			dev = inet6_fib_lookup_dev(net, cfg->rc_via);
> +			dev = inet6_fib_lookup_dev(net, nh->nh_via);
>  			break;
>  		case NEIGH_LINK_TABLE:
>  			break;
>  		}
>  	} else {
> -		dev = dev_get_by_index(net, cfg->rc_ifindex);
> +		dev = dev_get_by_index(net, oif);
>  	}
>  
>  	if (!dev)
> @@ -431,15 +469,208 @@ static struct net_device *find_outdev(struct net *net,
>  	return dev;
>  }
>  
> +static int mpls_nh_assign_dev(struct net *net, struct mpls_nh *nh, int oif)
> +{
> +	struct net_device *dev = NULL;
> +	int err = -ENODEV;
> +
> +	dev = find_outdev(net, nh, oif);
> +	if (IS_ERR(dev)) {
> +		err = PTR_ERR(dev);
> +		dev = NULL;
> +		goto errout;
> +	}
> +
> +	/* Ensure this is a supported device */
> +	err = -EINVAL;
> +	if (!mpls_dev_get(dev))
> +		goto errout;
> +
> +	RCU_INIT_POINTER(nh->nh_dev, dev);
> +	dev_put(dev);
> +
> +	return 0;
> +
> +errout:
> +	if (dev)
> +		dev_put(dev);
> +	return err;
> +}
> +
> +static struct mpls_nh *mpls_nh_alloc(size_t alen)
> +{
> +	struct mpls_nh *nh;
> +
> +	nh = kzalloc(sizeof(*nh) + alen, GFP_KERNEL);
> +	if (nh)
> +		nh->nh_via_alen = alen;
> +
> +	return nh;
> +}
> +
> +static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
> +				  struct mpls_route *rt)
> +{
> +	struct net *net = cfg->rc_nlinfo.nl_net;
> +	struct mpls_nh *nh = NULL;
> +	int err;
> +	int i;
> +
> +	err = -EINVAL;
> +	/* Ensure only a supported number of labels are present */
> +	if (cfg->rc_output_labels > MAX_NEW_LABELS)
> +		goto errout;
> +
> +	err = -ENOMEM;
> +	nh = mpls_nh_alloc(cfg->rc_via_alen);
> +	if (!nh)
> +		goto errout;
> +
> +	nh->nh_labels = cfg->rc_output_labels;
> +	for (i = 0; i < nh->nh_labels; i++)
> +		nh->nh_label[i] = cfg->rc_output_label[i];
> +
> +	nh->nh_via_table = cfg->rc_via_table;
> +	memcpy(nh->nh_via, cfg->rc_via, cfg->rc_via_alen);
> +	nh->nh_via_alen = cfg->rc_via_alen;
> +
> +	err = mpls_nh_assign_dev(net, nh, cfg->rc_ifindex);
> +	if (err)
> +		goto errout;
> +
> +	list_add_tail(&nh->nh_next, &rt->rt_nhs);
> +
> +	return 0;
> +
> +errout:
> +	kfree(nh);
> +
> +	return err;
> +}
> +
> +static int mpls_nh_build(struct net *net, struct mpls_nh **rt_nh,
> +			 int oif, struct nlattr *via_attr,
> +			 struct nlattr *newdst)
> +{
> +	struct mpls_nh *nh = NULL;
> +	int err;
> +	u8 via_alen;
> +	u8 via_table;
> +	u8 via[MAX_VIA_ALEN];
> +
> +	err = nla_get_via(via_attr, &via_alen, &via_table,
> +			  via);
> +	if (err)
> +		goto errout;
> +
> +	nh = mpls_nh_alloc(via_alen);
> +	if (!nh)
> +		goto errout;
> +
> +	if (newdst) {
> +		err = nla_get_labels(newdst, MAX_NEW_LABELS,
> +				     &nh->nh_labels, nh->nh_label);
> +		if (err)
> +			goto errout;
> +	}
> +	nh->nh_via_table = via_table;
> +	memcpy(nh->nh_via, via, via_alen);
> +
> +	err = mpls_nh_assign_dev(net, nh, oif);
> +	if (err)
> +		goto errout;
> +
> +	*rt_nh = nh;
> +
> +	return 0;
> +
> +errout:
> +	kfree(nh);
> +
> +	return err;
> +}
> +
> +static int mpls_count_nexthops(struct rtnexthop *rtnh, int len)
> +{
> +	int nhs = 0;
> +	int remaining = len;
> +
> +	while (rtnh_ok(rtnh, remaining)) {
> +		nhs++;
> +		rtnh = rtnh_next(rtnh, &remaining);
> +	}
> +
> +	/* leftover implies invalid nexthop configuration, discard it */
> +	return remaining > 0 ? 0 : nhs;
> +}
> +
> +static int mpls_nh_build_multi(struct mpls_route_config *cfg,
> +			       struct mpls_route *rt)
> +{
> +	struct rtnexthop *rtnh = cfg->rc_mp;
> +	struct nlattr *nla_via, *nla_newdst;
> +	int remaining = cfg->rc_mp_len;
> +	struct mpls_nh *nh, *nh_safe;
> +	int nhs = 0;
> +	int err = 0;
> +
> +	while (rtnh_ok(rtnh, remaining)) {
> +		int attrlen;
> +
> +		nla_via = NULL;
> +		nla_newdst = NULL;
> +		nh = NULL;
> +
> +		err = -EINVAL;
> +		if (!rtnh_ok(rtnh, remaining))
> +			goto errout;
> +
> +		attrlen = rtnh_attrlen(rtnh);
> +		if (attrlen > 0) {
> +			struct nlattr *attrs = rtnh_attrs(rtnh);
> +
> +			nla_via = nla_find(attrs, attrlen, RTA_VIA);
> +			nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST);
> +		}
> +
> +		err = -EINVAL;
> +		if (!nla_via)
> +			goto errout;
> +
> +		err = mpls_nh_build(cfg->rc_nlinfo.nl_net, &nh,
> +				    rtnh->rtnh_ifindex, nla_via,
> +				    nla_newdst);
> +		if (err)
> +			goto errout;
> +
> +		nh->nh_weight = rtnh->rtnh_hops + 1;
> +		list_add_tail(&nh->nh_next, &rt->rt_nhs);
> +
> +		rtnh = rtnh_next(rtnh, &remaining);
> +		nhs++;
> +	}
> +
> +	rt->rt_nhn = nhs;
> +
> +	return 0;
> +
> +errout:
> +	list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) {
> +		list_del(&nh->nh_next);
> +		kfree(nh);
> +	}
> +
> +	return err;
> +}
> +
>  static int mpls_route_add(struct mpls_route_config *cfg)
>  {
>  	struct mpls_route __rcu **platform_label;
>  	struct net *net = cfg->rc_nlinfo.nl_net;
> -	struct net_device *dev = NULL;
>  	struct mpls_route *rt, *old;
> -	unsigned index;
> -	int i;
>  	int err = -EINVAL;
> +	unsigned index;
> +	int nhs = 1; /* default to one nexthop */
>  
>  	index = cfg->rc_label;
>  
> @@ -457,27 +688,6 @@ static int mpls_route_add(struct mpls_route_config *cfg)
>  	if (index >= net->mpls.platform_labels)
>  		goto errout;
>  
> -	/* Ensure only a supported number of labels are present */
> -	if (cfg->rc_output_labels > MAX_NEW_LABELS)
> -		goto errout;
> -
> -	dev = find_outdev(net, cfg);
> -	if (IS_ERR(dev)) {
> -		err = PTR_ERR(dev);
> -		dev = NULL;
> -		goto errout;
> -	}
> -
> -	/* Ensure this is a supported device */
> -	err = -EINVAL;
> -	if (!mpls_dev_get(dev))
> -		goto errout;
> -
> -	err = -EINVAL;
> -	if ((cfg->rc_via_table == NEIGH_LINK_TABLE) &&
> -	    (dev->addr_len != cfg->rc_via_alen))
> -		goto errout;
> -
>  	/* Append makes no sense with mpls */
>  	err = -EOPNOTSUPP;
>  	if (cfg->rc_nlflags & NLM_F_APPEND)
> @@ -497,28 +707,34 @@ static int mpls_route_add(struct mpls_route_config *cfg)
>  	if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old)
>  		goto errout;
>  
> +	if (cfg->rc_mp) {
> +		err = -EINVAL;
> +		nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len);
> +		if (nhs == 0)
> +			goto errout;
> +	}
> +
>  	err = -ENOMEM;
> -	rt = mpls_rt_alloc(cfg->rc_via_alen);
> +	rt = mpls_rt_alloc(nhs);
>  	if (!rt)
>  		goto errout;
> -
> -	rt->rt_labels = cfg->rc_output_labels;
> -	for (i = 0; i < rt->rt_labels; i++)
> -		rt->rt_label[i] = cfg->rc_output_label[i];
>  	rt->rt_protocol = cfg->rc_protocol;
> -	RCU_INIT_POINTER(rt->rt_dev, dev);
>  	rt->rt_payload_type = cfg->rc_payload_type;
> -	rt->rt_via_table = cfg->rc_via_table;
> -	memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen);
>  
> -	mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo);
> +	if (cfg->rc_mp)
> +		err = mpls_nh_build_multi(cfg, rt);
> +	else
> +		err = mpls_nh_build_from_cfg(cfg, rt);
> +	if (err)
> +		goto freert;
> +
> +	mpls_route_update(net, index, rt, &cfg->rc_nlinfo);
>  
> -	dev_put(dev);
>  	return 0;
>  
> +freert:
> +	mpls_rt_free(rt);
>  errout:
> -	if (dev)
> -		dev_put(dev);
>  	return err;
>  }
>  
> @@ -538,7 +754,7 @@ static int mpls_route_del(struct mpls_route_config *cfg)
>  	if (index >= net->mpls.platform_labels)
>  		goto errout;
>  
> -	mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo);
> +	mpls_route_update(net, index, NULL, &cfg->rc_nlinfo);
>  
>  	err = 0;
>  errout:
> @@ -628,6 +844,7 @@ static void mpls_ifdown(struct net_device *dev)
>  	struct mpls_route __rcu **platform_label;
>  	struct net *net = dev_net(dev);
>  	struct mpls_dev *mdev;
> +	struct mpls_nh *nh;
>  	unsigned index;
>  
>  	platform_label = rtnl_dereference(net->mpls.platform_label);
> @@ -635,9 +852,14 @@ static void mpls_ifdown(struct net_device *dev)
>  		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
>  		if (!rt)
>  			continue;
> -		if (rtnl_dereference(rt->rt_dev) != dev)
> -			continue;
> -		rt->rt_dev = NULL;
> +		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +			struct net_device *mdev;
> +
> +			mdev = rtnl_dereference(nh->nh_dev);
> +			if (mdev != dev)
> +				continue;
> +			nh->nh_dev = NULL;
> +		}
>  	}
>  
>  	mdev = mpls_dev_get(dev);
> @@ -736,7 +958,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
>  EXPORT_SYMBOL_GPL(nla_put_labels);
>  
>  int nla_get_labels(const struct nlattr *nla,
> -		   u32 max_labels, u32 *labels, u32 label[])
> +		   u8 max_labels, u8 *labels, u32 label[])
>  {
>  	unsigned len = nla_len(nla);
>  	unsigned nla_labels;
> @@ -781,6 +1003,48 @@ int nla_get_labels(const struct nlattr *nla,
>  }
>  EXPORT_SYMBOL_GPL(nla_get_labels);
>  
> +int nla_get_via(const struct nlattr *nla, u8 *via_alen,
> +		u8 *via_table, u8 via_addr[])
> +{
> +	struct rtvia *via = nla_data(nla);
> +	int err = -EINVAL;
> +	u8 alen;
> +
> +	if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
> +		goto errout;
> +	alen = nla_len(nla) -
> +			offsetof(struct rtvia, rtvia_addr);
> +	if (alen > MAX_VIA_ALEN)
> +		goto errout;
> +
> +	/* Validate the address family */
> +	switch (via->rtvia_family) {
> +	case AF_PACKET:
> +		*via_table = NEIGH_LINK_TABLE;
> +		break;
> +	case AF_INET:
> +		*via_table = NEIGH_ARP_TABLE;
> +		if (alen != 4)
> +			goto errout;
> +		break;
> +	case AF_INET6:
> +		*via_table = NEIGH_ND_TABLE;
> +		if (alen != 16)
> +			goto errout;
> +		break;
> +	default:
> +		/* Unsupported address family */
> +		goto errout;
> +	}
> +
> +	memcpy(via_addr, via->rtvia_addr, alen);
> +	*via_alen = alen;
> +	err = 0;
> +
> +errout:
> +	return err;
> +}
> +
>  static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>  			       struct mpls_route_config *cfg)
>  {
> @@ -844,7 +1108,7 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>  			break;
>  		case RTA_DST:
>  		{
> -			u32 label_count;
> +			u8 label_count;
>  			if (nla_get_labels(nla, 1, &label_count,
>  					   &cfg->rc_label))
>  				goto errout;
> @@ -857,35 +1121,15 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>  		}
>  		case RTA_VIA:
>  		{
> -			struct rtvia *via = nla_data(nla);
> -			if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
> +			if (nla_get_via(nla, &cfg->rc_via_alen,
> +					&cfg->rc_via_table, cfg->rc_via))
>  				goto errout;
> -			cfg->rc_via_alen   = nla_len(nla) -
> -				offsetof(struct rtvia, rtvia_addr);
> -			if (cfg->rc_via_alen > MAX_VIA_ALEN)
> -				goto errout;
> -
> -			/* Validate the address family */
> -			switch(via->rtvia_family) {
> -			case AF_PACKET:
> -				cfg->rc_via_table = NEIGH_LINK_TABLE;
> -				break;
> -			case AF_INET:
> -				cfg->rc_via_table = NEIGH_ARP_TABLE;
> -				if (cfg->rc_via_alen != 4)
> -					goto errout;
> -				break;
> -			case AF_INET6:
> -				cfg->rc_via_table = NEIGH_ND_TABLE;
> -				if (cfg->rc_via_alen != 16)
> -					goto errout;
> -				break;
> -			default:
> -				/* Unsupported address family */
> -				goto errout;
> -			}
> -
> -			memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen);
> +			break;
> +		}
> +		case RTA_MULTIPATH:
> +		{
> +			cfg->rc_mp = nla_data(nla);
> +			cfg->rc_mp_len = nla_len(nla);
>  			break;
>  		}
>  		default:
> @@ -946,16 +1190,56 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
>  	rtm->rtm_type = RTN_UNICAST;
>  	rtm->rtm_flags = 0;
>  
> -	if (rt->rt_labels &&
> -	    nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label))
> -		goto nla_put_failure;
> -	if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen))
> -		goto nla_put_failure;
> -	dev = rtnl_dereference(rt->rt_dev);
> -	if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
> -		goto nla_put_failure;
>  	if (nla_put_labels(skb, RTA_DST, 1, &label))
>  		goto nla_put_failure;
> +	if (rt->rt_nhn == 1) {
> +		struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs,
> +							struct mpls_nh,
> +							nh_next);
> +
> +		if (nh->nh_labels &&
> +		    nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
> +				   nh->nh_label))
> +			goto nla_put_failure;
> +		if (nla_put_via(skb, nh->nh_via_table, nh->nh_via,
> +				nh->nh_via_alen))
> +			goto nla_put_failure;
> +		dev = rtnl_dereference(nh->nh_dev);
> +		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
> +			goto nla_put_failure;
> +	} else {
> +		struct rtnexthop *rtnh;
> +		struct nlattr *mp;
> +		struct mpls_nh *nh;
> +
> +		mp = nla_nest_start(skb, RTA_MULTIPATH);
> +		if (!mp)
> +			goto nla_put_failure;
> +
> +		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
> +			if (!rtnh)
> +				goto nla_put_failure;
> +
> +			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
> +			dev = rtnl_dereference(nh->nh_dev);
> +			if (dev)
> +				rtnh->rtnh_ifindex = dev->ifindex;
> +			if (nh->nh_labels &&
> +			    nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
> +					   nh->nh_label))
> +				goto nla_put_failure;
> +			if (nla_put_via(skb, nh->nh_via_table,
> +					nh->nh_via,
> +					nh->nh_via_alen))
> +				goto nla_put_failure;
> +
> +			/* length of rtnetlink header + attributes */
> +			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
> +		}
> +
> +		nla_nest_end(skb, mp);
> +	}
>  
>  	nlmsg_end(skb, nlh);
>  	return 0;
> @@ -1000,12 +1284,34 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
>  {
>  	size_t payload =
>  		NLMSG_ALIGN(sizeof(struct rtmsg))
> -		+ nla_total_size(2 + rt->rt_via_alen)	/* RTA_VIA */
>  		+ nla_total_size(4);			/* RTA_DST */
> -	if (rt->rt_labels)				/* RTA_NEWDST */
> -		payload += nla_total_size(rt->rt_labels * 4);
> -	if (rt->rt_dev)					/* RTA_OIF */
> -		payload += nla_total_size(4);
> +
> +	if (rt->rt_nhn == 1) {
> +		struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs,
> +							      struct mpls_nh,
> +							      nh_next);
> +
> +		if (nh->nh_dev)
> +			payload += nla_total_size(4); /* RTA_OIF */
> +		payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */
> +		if (nh->nh_labels) /* RTA_NEWDST */
> +			payload += nla_total_size(nh->nh_labels * 4);
> +	} else {
> +		struct mpls_nh *nh;
> +		/* each nexthop is packed in an attribute */
> +		size_t nhsize = 0;
> +
> +		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +			nhsize += nla_total_size(sizeof(struct rtnexthop)) +
> +					nla_total_size(nh->nh_via_alen +
> +						       2); /* RTA_VIA */
> +			if (nh->nh_labels) /* RTA_NEWDST */
> +				nhsize += nla_total_size(nh->nh_labels * 4);
> +		}
> +		/* nested attribute */
> +		payload += nla_total_size(nhsize);
> +	}
> +
>  	return payload;
>  }
>  
> @@ -1057,25 +1363,37 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>  	/* In case the predefined labels need to be populated */
>  	if (limit > MPLS_LABEL_IPV4NULL) {
>  		struct net_device *lo = net->loopback_dev;
> -		rt0 = mpls_rt_alloc(lo->addr_len);
> +		struct mpls_nh *nh;
> +
> +		rt0 = mpls_rt_alloc(1);
>  		if (!rt0)
>  			goto nort0;
> -		RCU_INIT_POINTER(rt0->rt_dev, lo);
>  		rt0->rt_protocol = RTPROT_KERNEL;
>  		rt0->rt_payload_type = MPT_IPV4;
> -		rt0->rt_via_table = NEIGH_LINK_TABLE;
> -		memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
> +		nh = mpls_nh_alloc(lo->addr_len);
> +		if (!nh)
> +			goto nort2;
> +		RCU_INIT_POINTER(nh->nh_dev, lo);
> +		nh->nh_via_table = NEIGH_LINK_TABLE;
> +		memcpy(nh->nh_via, lo->dev_addr, lo->addr_len);
> +		list_add_tail(&nh->nh_next, &rt0->rt_nhs);
>  	}
>  	if (limit > MPLS_LABEL_IPV6NULL) {
>  		struct net_device *lo = net->loopback_dev;
> -		rt2 = mpls_rt_alloc(lo->addr_len);
> +		struct mpls_nh *nh;
> +
> +		rt2 = mpls_rt_alloc(1);
>  		if (!rt2)
>  			goto nort2;
> -		RCU_INIT_POINTER(rt2->rt_dev, lo);
>  		rt2->rt_protocol = RTPROT_KERNEL;
>  		rt2->rt_payload_type = MPT_IPV6;
> -		rt2->rt_via_table = NEIGH_LINK_TABLE;
> -		memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
> +		nh = mpls_nh_alloc(lo->addr_len);
> +		if (!nh)
> +			goto nort2;
> +		RCU_INIT_POINTER(nh->nh_dev, lo);
> +		nh->nh_via_table = NEIGH_LINK_TABLE;
> +		memcpy(nh->nh_via, lo->dev_addr, lo->addr_len);
> +		list_add_tail(&nh->nh_next, &rt2->rt_nhs);
>  	}
>  
>  	rtnl_lock();
> @@ -1085,7 +1403,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>  
>  	/* Free any labels beyond the new table */
>  	for (index = limit; index < old_limit; index++)
> -		mpls_route_update(net, index, NULL, NULL, NULL);
> +		mpls_route_update(net, index, NULL, NULL);
>  
>  	/* Copy over the old labels */
>  	cp_size = size;
> @@ -1124,6 +1442,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>  
>  nort2:
>  	mpls_rt_free(rt0);
> +	mpls_rt_free(rt2);
>  nort0:
>  	kvfree(labels);
>  nolabels:
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index 2681a4b..9e18b58 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -1,6 +1,17 @@
>  #ifndef MPLS_INTERNAL_H
>  #define MPLS_INTERNAL_H
>  
> +enum mpls_payload_type {
> +	MPT_UNSPEC, /* IPv4 or IPv6 */
> +	MPT_IPV4 = 4,
> +	MPT_IPV6 = 6,
> +
> +	/* Other types not implemented:
> +	 *  - Pseudo-wire with or without control word (RFC4385)
> +	 *  - GAL (RFC5586)
> +	 */
> +};
> +
>  struct mpls_shim_hdr {
>  	__be32 label_stack_entry;
>  };
> @@ -21,6 +32,34 @@ struct mpls_dev {
>  
>  struct sk_buff;
>  
> +#define LABEL_NOT_SPECIFIED (1 << 20)
> +#define MAX_NEW_LABELS 2
> +
> +/* This maximum ha length copied from the definition of struct neighbour */
> +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
> +
> +struct mpls_nh {
> +	struct net_device __rcu *nh_dev;
> +	u32			nh_label[MAX_NEW_LABELS];
> +	unsigned int		nh_flags;
> +	int                     nh_weight;
> +	int                     nh_power;
> +	struct list_head	nh_next;
> +	u8			nh_labels;
> +	u8			nh_via_alen;
> +	u8			nh_via_table;
> +	u8			nh_via[0];
> +};
> +
> +struct mpls_route {
> +	struct rcu_head		rt_rcu;
> +	u8			rt_protocol;
> +	u8			rt_payload_type;
> +	int                     rt_power;
> +	int			rt_nhn;
> +	struct list_head        rt_nhs;
> +};
> +
>  static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
>  {
>  	return (struct mpls_shim_hdr *)skb_network_header(skb);
> @@ -52,8 +91,10 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
>  
>  int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
>  		   const u32 label[]);
> -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels,
> +int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels,
>  		   u32 label[]);
> +int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
> +		u8 via[]);
>  bool mpls_output_possible(const struct net_device *dev);
>  unsigned int mpls_dev_mtu(const struct net_device *dev);
>  bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Oct. 6, 2015, 8:11 p.m. UTC | #2
ebiederm@xmission.com (Eric W. Biederman) writes:

> Roopa Prabhu <roopa@cumulusnetworks.com> writes:
>
>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>
>> This patch adds support for MPLS multipath routes.
>>
>> Includes following changes to support multipath:
>> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh'
>>
>> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry
>>
>> - moves mpls route and nexthop structures into internal.h
>>
>> - A mpls_route can point to multiple mpls_nh structs
>>
>> - the nexthops are maintained as a list
>
> So I am not certain I like nexthops being a list.  In the practical case
> introducing this list guarantees that everyone will see at least an
> extra cache line miss in the forwarding path.
>
> In the more abstract sense, a list is the wrong data structure.  If the
> list is so short that we can afford to walk it, an array is a better data
> structure.  If we need enough entries to make the memory consumption
> of an array a concern, we want some kind of hash table or tree data
> structure, because a list will be too long in that case.
>
> So can we please not use a list?
>
> I expect we can simplify the data structures by noting that rt_via must
> be an Ethernet MAC today, so 6 bytes are enough, and 8 bytes give us
> a bit extra and align things nicely.

Grr. My mistake.  The current worst case is 16 bytes for an IPv6
address in rt_via.  But the point remains that a fixed-size array of
bytes in rt_via allows the use of an array and not a list for nexthops.

At least for the single nexthop case I really want something that is
small enough that it fits in a single 64-byte cache line.  The performance
difference compared to anything else is going to be noticeable.

Eric

> Also, I know it goes away in the next patch, but a spinlock taken for
> every transit through the forwarding path really bugs me.
>
> Eric
>
>> - In the process of restructuring, this patch also consistently changes all
>> label-count fields ('labels') to u8; the label values themselves stay u32
>>
>> - Adds support to parse/fill RTA_MULTIPATH netlink attribute for
>> multipath routes similar to ipv4/v6 fib
>>
>> - In this patch, the multipath route nexthop selection algorithm
>> is a simple round robin picked up from ipv4 fib code and is replaced by
>> a hash based algorithm from Robert Shearman in the next patch
>>
>> - mpls_route_update cleanup: remove 'dev' handling in mpls_route_update.
>> Although mpls_route_update was implemented to update based on dev, it was
>> never used that way. The dev handling also gets tricky with multiple
>> nexthops, because a route cannot be matched against any single nexthop's
>> dev. So, this patch removes the unused 'dev' handling in mpls_route_update.
>
>>
>> Example:
>>
>> $ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \
>>                 nexthop as 700 via inet 10.1.1.6 dev swp2 \
>>                 nexthop as 800 via inet 40.1.1.2 dev swp3
>>
>> $ip  -f mpls route show
>> 100
>>         nexthop as to 200 via inet 10.1.1.2  dev swp1
>>         nexthop as to 700 via inet 10.1.1.6  dev swp2
>>         nexthop as to 800 via inet 40.1.1.2  dev swp3
>>
>> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
>> ---
>>  include/net/mpls_iptunnel.h |   2 +-
>>  net/mpls/af_mpls.c          | 627 +++++++++++++++++++++++++++++++++-----------
>>  net/mpls/internal.h         |  43 ++-
>>  3 files changed, 516 insertions(+), 156 deletions(-)
>>
>> diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
>> index 4757997..179253f 100644
>> --- a/include/net/mpls_iptunnel.h
>> +++ b/include/net/mpls_iptunnel.h
>> @@ -18,7 +18,7 @@
>>  
>>  struct mpls_iptunnel_encap {
>>  	u32	label[MAX_NEW_LABELS];
>> -	u32	labels;
>> +	u8	labels;
>>  };
>>  
>>  static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate)
>> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
>> index 8c5707d..ae9e153 100644
>> --- a/net/mpls/af_mpls.c
>> +++ b/net/mpls/af_mpls.c
>> @@ -19,39 +19,12 @@
>>  #include <net/ipv6.h>
>>  #include <net/addrconf.h>
>>  #endif
>> +#include <net/nexthop.h>
>>  #include "internal.h"
>>  
>> -#define LABEL_NOT_SPECIFIED (1<<20)
>> -#define MAX_NEW_LABELS 2
>> -
>> -/* This maximum ha length copied from the definition of struct neighbour */
>> -#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
>> -
>> -enum mpls_payload_type {
>> -	MPT_UNSPEC, /* IPv4 or IPv6 */
>> -	MPT_IPV4 = 4,
>> -	MPT_IPV6 = 6,
>> -
>> -	/* Other types not implemented:
>> -	 *  - Pseudo-wire with or without control word (RFC4385)
>> -	 *  - GAL (RFC5586)
>> -	 */
>> -};
>> -
>> -struct mpls_route { /* next hop label forwarding entry */
>> -	struct net_device __rcu *rt_dev;
>> -	struct rcu_head		rt_rcu;
>> -	u32			rt_label[MAX_NEW_LABELS];
>> -	u8			rt_protocol; /* routing protocol that set this entry */
>> -	u8                      rt_payload_type;
>> -	u8			rt_labels;
>> -	u8			rt_via_alen;
>> -	u8			rt_via_table;
>> -	u8			rt_via[0];
>> -};
>> -
>>  static int zero = 0;
>>  static int label_limit = (1 << 20) - 1;
>> +static DEFINE_SPINLOCK(mpls_multipath_lock);
>>  
>>  static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
>>  		       struct nlmsghdr *nlh, struct net *net, u32 portid,
>> @@ -80,10 +53,10 @@ bool mpls_output_possible(const struct net_device *dev)
>>  }
>>  EXPORT_SYMBOL_GPL(mpls_output_possible);
>>  
>> -static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
>> +static unsigned int mpls_nh_header_size(const struct mpls_nh *nh)
>>  {
>>  	/* The size of the layer 2.5 labels to be added for this route */
>> -	return rt->rt_labels * sizeof(struct mpls_shim_hdr);
>> +	return nh->nh_labels * sizeof(struct mpls_shim_hdr);
>>  }
>>  
>>  unsigned int mpls_dev_mtu(const struct net_device *dev)
>> @@ -105,8 +78,58 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
>>  }
>>  EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
>>  
>> -static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
>> -			struct mpls_entry_decoded dec)
>> +/* This is a cut/copy/modify from fib_select_multipath */
>> +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt)
>> +{
>> +	struct mpls_nh *nh;
>> +	struct mpls_nh *ret_nh;
>> +	int nhsel = 0;
>> +	int w;
>> +
>> +	spin_lock_bh(&mpls_multipath_lock);
>> +	ret_nh = list_first_entry_or_null(&rt->rt_nhs, struct mpls_nh,
>> +					  nh_next);
>> +	if (rt->rt_power <= 0) {
>> +		int power = 0;
>> +
>> +		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
>> +			power += nh->nh_weight;
>> +			nh->nh_power = nh->nh_weight;
>> +		}
>> +		rt->rt_power = power;
>> +		if (power <= 0) {
>> +			spin_unlock_bh(&mpls_multipath_lock);
>> +			/* Race condition: route has just become dead. */
>> +			return ret_nh;
>> +		}
>> +	}
>> +
>> +	/* w should be random number [0..rt->rt_power-1],
>> +	 * it is pretty bad approximation.
>> +	 */
>> +	w = jiffies % rt->rt_power;
>> +
>> +	list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
>> +		if (nh->nh_power) {
>> +			w -= nh->nh_power;
>> +			if (w <= 0) {
>> +				nh->nh_power--;
>> +				rt->rt_power--;
>> +				ret_nh = nh;
>> +				spin_unlock_bh(&mpls_multipath_lock);
>> +				return ret_nh;
>> +			}
>> +		}
>> +		nhsel++;
>> +	}
>> +
>> +	/* Race condition: route has just become dead. */
>> +	spin_unlock_bh(&mpls_multipath_lock);
>> +	return ret_nh;
>> +}
>> +
>> +static bool mpls_egress(struct mpls_route *rt, struct mpls_nh *nh,
>> +			struct sk_buff *skb, struct mpls_entry_decoded dec)
>>  {
>>  	enum mpls_payload_type payload_type;
>>  	bool success = false;
>> @@ -159,6 +182,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>>  	struct net *net = dev_net(dev);
>>  	struct mpls_shim_hdr *hdr;
>>  	struct mpls_route *rt;
>> +	struct mpls_nh *nh;
>>  	struct mpls_entry_decoded dec;
>>  	struct net_device *out_dev;
>>  	struct mpls_dev *mdev;
>> @@ -196,9 +220,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>>  	if (!rt)
>>  		goto drop;
>>  
>> +	nh = mpls_select_multipath(rt);
>> +	if (!nh)
>> +		goto drop;
>> +
>>  	/* Find the output device */
>> -	out_dev = rcu_dereference(rt->rt_dev);
>> -	if (!mpls_output_possible(out_dev))
>> +	out_dev = rcu_dereference(nh->nh_dev);
>> +	if (!out_dev || !mpls_output_possible(out_dev))
>>  		goto drop;
>>  
>>  	if (skb_warn_if_lro(skb))
>> @@ -212,7 +240,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>>  	dec.ttl -= 1;
>>  
>>  	/* Verify the destination can hold the packet */
>> -	new_header_size = mpls_rt_header_size(rt);
>> +	new_header_size = mpls_nh_header_size(nh);
>>  	mtu = mpls_dev_mtu(out_dev);
>>  	if (mpls_pkt_too_big(skb, mtu - new_header_size))
>>  		goto drop;
>> @@ -230,7 +258,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>>  
>>  	if (unlikely(!new_header_size && dec.bos)) {
>>  		/* Penultimate hop popping */
>> -		if (!mpls_egress(rt, skb, dec))
>> +		if (!mpls_egress(rt, nh, skb, dec))
>>  			goto drop;
>>  	} else {
>>  		bool bos;
>> @@ -240,13 +268,14 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
>>  		/* Push the new labels */
>>  		hdr = mpls_hdr(skb);
>>  		bos = dec.bos;
>> -		for (i = rt->rt_labels - 1; i >= 0; i--) {
>> -			hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos);
>> +		for (i = nh->nh_labels - 1; i >= 0; i--) {
>> +			hdr[i] = mpls_entry_encode(nh->nh_label[i],
>> +						   dec.ttl, 0, bos);
>>  			bos = false;
>>  		}
>>  	}
>>  
>> -	err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb);
>> +	err = neigh_xmit(nh->nh_via_table, out_dev, nh->nh_via, skb);
>>  	if (err)
>>  		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
>>  				    __func__, err);
>> @@ -270,31 +299,43 @@ static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = {
>>  struct mpls_route_config {
>>  	u32			rc_protocol;
>>  	u32			rc_ifindex;
>> -	u16			rc_via_table;
>> -	u16			rc_via_alen;
>> +	u8			rc_via_table;
>> +	u8			rc_via_alen;
>>  	u8			rc_via[MAX_VIA_ALEN];
>> +	u8			rc_output_labels;
>>  	u32			rc_label;
>> -	u32			rc_output_labels;
>>  	u32			rc_output_label[MAX_NEW_LABELS];
>>  	u32			rc_nlflags;
>>  	enum mpls_payload_type	rc_payload_type;
>>  	struct nl_info		rc_nlinfo;
>> +	struct rtnexthop	*rc_mp;
>> +	int			rc_mp_len;
>>  };
>>  
>> -static struct mpls_route *mpls_rt_alloc(size_t alen)
>> +static struct mpls_route *mpls_rt_alloc(int num_nh)
>>  {
>>  	struct mpls_route *rt;
>>  
>> -	rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
>> -	if (rt)
>> -		rt->rt_via_alen = alen;
>> +	rt = kzalloc(sizeof(*rt), GFP_KERNEL);
>> +	if (rt) {
>> +		rt->rt_nhn = num_nh;
>> +		INIT_LIST_HEAD(&rt->rt_nhs);
>> +	}
>> +
>>  	return rt;
>>  }
>>  
>>  static void mpls_rt_free(struct mpls_route *rt)
>>  {
>> -	if (rt)
>> +	struct mpls_nh *nh, *nh_safe;
>> +
>> +	if (rt) {
>> +		list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) {
>> +			list_del(&nh->nh_next);
>> +			kfree(nh);
>> +		}
>>  		kfree_rcu(rt, rt_rcu);
>> +	}
>>  }
>>  
>>  static void mpls_notify_route(struct net *net, unsigned index,
>> @@ -312,25 +353,22 @@ static void mpls_notify_route(struct net *net, unsigned index,
>>  }
>>  
>>  static void mpls_route_update(struct net *net, unsigned index,
>> -			      struct net_device *dev, struct mpls_route *new,
>> +			      struct mpls_route *new,
>>  			      const struct nl_info *info)
>>  {
>>  	struct mpls_route __rcu **platform_label;
>> -	struct mpls_route *rt, *old = NULL;
>> +	struct mpls_route *rt;
>>  
>>  	ASSERT_RTNL();
>>  
>>  	platform_label = rtnl_dereference(net->mpls.platform_label);
>>  	rt = rtnl_dereference(platform_label[index]);
>> -	if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) {
>> -		rcu_assign_pointer(platform_label[index], new);
>> -		old = rt;
>> -	}
>> +	rcu_assign_pointer(platform_label[index], new);
>>  
>> -	mpls_notify_route(net, index, old, new, info);
>> +	mpls_notify_route(net, index, rt, new, info);
>>  
>>  	/* If we removed a route free it now */
>> -	mpls_rt_free(old);
>> +	mpls_rt_free(rt);
>>  }
>>  
>>  static unsigned find_free_label(struct net *net)
>> @@ -406,23 +444,23 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr)
>>  #endif
>>  
>>  static struct net_device *find_outdev(struct net *net,
>> -				      struct mpls_route_config *cfg)
>> +				      struct mpls_nh *nh, int oif)
>>  {
>>  	struct net_device *dev = NULL;
>>  
>> -	if (!cfg->rc_ifindex) {
>> -		switch (cfg->rc_via_table) {
>> +	if (!oif) {
>> +		switch (nh->nh_via_table) {
>>  		case NEIGH_ARP_TABLE:
>> -			dev = inet_fib_lookup_dev(net, cfg->rc_via);
>> +			dev = inet_fib_lookup_dev(net, nh->nh_via);
>>  			break;
>>  		case NEIGH_ND_TABLE:
>> -			dev = inet6_fib_lookup_dev(net, cfg->rc_via);
>> +			dev = inet6_fib_lookup_dev(net, nh->nh_via);
>>  			break;
>>  		case NEIGH_LINK_TABLE:
>>  			break;
>>  		}
>>  	} else {
>> -		dev = dev_get_by_index(net, cfg->rc_ifindex);
>> +		dev = dev_get_by_index(net, oif);
>>  	}
>>  
>>  	if (!dev)
>> @@ -431,15 +469,208 @@ static struct net_device *find_outdev(struct net *net,
>>  	return dev;
>>  }
>>  
>> +static int mpls_nh_assign_dev(struct net *net, struct mpls_nh *nh, int oif)
>> +{
>> +	struct net_device *dev = NULL;
>> +	int err = -ENODEV;
>> +
>> +	dev = find_outdev(net, nh, oif);
>> +	if (IS_ERR(dev)) {
>> +		err = PTR_ERR(dev);
>> +		dev = NULL;
>> +		goto errout;
>> +	}
>> +
>> +	/* Ensure this is a supported device */
>> +	err = -EINVAL;
>> +	if (!mpls_dev_get(dev))
>> +		goto errout;
>> +
>> +	RCU_INIT_POINTER(nh->nh_dev, dev);
>> +	dev_put(dev);
>> +
>> +	return 0;
>> +
>> +errout:
>> +	if (dev)
>> +		dev_put(dev);
>> +	return err;
>> +}
>> +
>> +static struct mpls_nh *mpls_nh_alloc(size_t alen)
>> +{
>> +	struct mpls_nh *nh;
>> +
>> +	nh = kzalloc(sizeof(*nh) + alen, GFP_KERNEL);
>> +	if (nh)
>> +		nh->nh_via_alen = alen;
>> +
>> +	return nh;
>> +}
>> +
>> +static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
>> +				  struct mpls_route *rt)
>> +{
>> +	struct net *net = cfg->rc_nlinfo.nl_net;
>> +	struct mpls_nh *nh = NULL;
>> +	int err;
>> +	int i;
>> +
>> +	err = -EINVAL;
>> +	/* Ensure only a supported number of labels are present */
>> +	if (cfg->rc_output_labels > MAX_NEW_LABELS)
>> +		goto errout;
>> +
>> +	err = -ENOMEM;
>> +	nh = mpls_nh_alloc(cfg->rc_via_alen);
>> +	if (!nh)
>> +		goto errout;
>> +
>> +	nh->nh_labels = cfg->rc_output_labels;
>> +	for (i = 0; i < nh->nh_labels; i++)
>> +		nh->nh_label[i] = cfg->rc_output_label[i];
>> +
>> +	nh->nh_via_table = cfg->rc_via_table;
>> +	memcpy(nh->nh_via, cfg->rc_via, cfg->rc_via_alen);
>> +	nh->nh_via_alen = cfg->rc_via_alen;
>> +
>> +	err = mpls_nh_assign_dev(net, nh, cfg->rc_ifindex);
>> +	if (err)
>> +		goto errout;
>> +
>> +	list_add_tail(&nh->nh_next, &rt->rt_nhs);
>> +
>> +	return 0;
>> +
>> +errout:
>> +	kfree(nh);
>> +
>> +	return err;
>> +}
>> +
>> +static int mpls_nh_build(struct net *net, struct mpls_nh **rt_nh,
>> +			 int oif, struct nlattr *via_attr,
>> +			 struct nlattr *newdst)
>> +{
>> +	struct mpls_nh *nh = NULL;
>> +	int err;
>> +	u8 via_alen;
>> +	u8 via_table;
>> +	u8 via[MAX_VIA_ALEN];
>> +
>> +	err = nla_get_via(via_attr, &via_alen, &via_table,
>> +			  via);
>> +	if (err)
>> +		goto errout;
>> +
>> +	nh = mpls_nh_alloc(via_alen);
>> +	if (!nh)
>> +		goto errout;
>> +
>> +	if (newdst) {
>> +		err = nla_get_labels(newdst, MAX_NEW_LABELS,
>> +				     &nh->nh_labels, nh->nh_label);
>> +		if (err)
>> +			goto errout;
>> +	}
>> +	nh->nh_via_table = via_table;
>> +	memcpy(nh->nh_via, via, via_alen);
>> +
>> +	err = mpls_nh_assign_dev(net, nh, oif);
>> +	if (err)
>> +		goto errout;
>> +
>> +	*rt_nh = nh;
>> +
>> +	return 0;
>> +
>> +errout:
>> +	kfree(nh);
>> +
>> +	return err;
>> +}
>> +
>> +static int mpls_count_nexthops(struct rtnexthop *rtnh, int len)
>> +{
>> +	int nhs = 0;
>> +	int remaining = len;
>> +
>> +	while (rtnh_ok(rtnh, remaining)) {
>> +		nhs++;
>> +		rtnh = rtnh_next(rtnh, &remaining);
>> +	}
>> +
>> +	/* leftover implies invalid nexthop configuration, discard it */
>> +	return remaining > 0 ? 0 : nhs;
>> +}
>> +
>> +static int mpls_nh_build_multi(struct mpls_route_config *cfg,
>> +			       struct mpls_route *rt)
>> +{
>> +	struct rtnexthop *rtnh = cfg->rc_mp;
>> +	struct nlattr *nla_via, *nla_newdst;
>> +	int remaining = cfg->rc_mp_len;
>> +	struct mpls_nh *nh, *nh_safe;
>> +	int nhs = 0;
>> +	int err = 0;
>> +
>> +	while (rtnh_ok(rtnh, remaining)) {
>> +		int attrlen;
>> +
>> +		nla_via = NULL;
>> +		nla_newdst = NULL;
>> +		nh = NULL;
>> +
>> +		err = -EINVAL;
>> +		if (!rtnh_ok(rtnh, remaining))
>> +			goto errout;
>> +
>> +		attrlen = rtnh_attrlen(rtnh);
>> +		if (attrlen > 0) {
>> +			struct nlattr *attrs = rtnh_attrs(rtnh);
>> +
>> +			nla_via = nla_find(attrs, attrlen, RTA_VIA);
>> +			nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST);
>> +		}
>> +
>> +		err = -EINVAL;
>> +		if (!nla_via)
>> +			goto errout;
>> +
>> +		err = mpls_nh_build(cfg->rc_nlinfo.nl_net, &nh,
>> +				    rtnh->rtnh_ifindex, nla_via,
>> +				    nla_newdst);
>> +		if (err)
>> +			goto errout;
>> +
>> +		nh->nh_weight = rtnh->rtnh_hops + 1;
>> +		list_add_tail(&nh->nh_next, &rt->rt_nhs);
>> +
>> +		rtnh = rtnh_next(rtnh, &remaining);
>> +		nhs++;
>> +	}
>> +
>> +	rt->rt_nhn = nhs;
>> +
>> +	return 0;
>> +
>> +errout:
>> +	list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) {
>> +		list_del(&nh->nh_next);
>> +		kfree(nh);
>> +	}
>> +
>> +	return err;
>> +}
>> +
>>  static int mpls_route_add(struct mpls_route_config *cfg)
>>  {
>>  	struct mpls_route __rcu **platform_label;
>>  	struct net *net = cfg->rc_nlinfo.nl_net;
>> -	struct net_device *dev = NULL;
>>  	struct mpls_route *rt, *old;
>> -	unsigned index;
>> -	int i;
>>  	int err = -EINVAL;
>> +	unsigned index;
>> +	int nhs = 1; /* default to one nexthop */
>>  
>>  	index = cfg->rc_label;
>>  
>> @@ -457,27 +688,6 @@ static int mpls_route_add(struct mpls_route_config *cfg)
>>  	if (index >= net->mpls.platform_labels)
>>  		goto errout;
>>  
>> -	/* Ensure only a supported number of labels are present */
>> -	if (cfg->rc_output_labels > MAX_NEW_LABELS)
>> -		goto errout;
>> -
>> -	dev = find_outdev(net, cfg);
>> -	if (IS_ERR(dev)) {
>> -		err = PTR_ERR(dev);
>> -		dev = NULL;
>> -		goto errout;
>> -	}
>> -
>> -	/* Ensure this is a supported device */
>> -	err = -EINVAL;
>> -	if (!mpls_dev_get(dev))
>> -		goto errout;
>> -
>> -	err = -EINVAL;
>> -	if ((cfg->rc_via_table == NEIGH_LINK_TABLE) &&
>> -	    (dev->addr_len != cfg->rc_via_alen))
>> -		goto errout;
>> -
>>  	/* Append makes no sense with mpls */
>>  	err = -EOPNOTSUPP;
>>  	if (cfg->rc_nlflags & NLM_F_APPEND)
>> @@ -497,28 +707,34 @@ static int mpls_route_add(struct mpls_route_config *cfg)
>>  	if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old)
>>  		goto errout;
>>  
>> +	if (cfg->rc_mp) {
>> +		err = -EINVAL;
>> +		nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len);
>> +		if (nhs == 0)
>> +			goto errout;
>> +	}
>> +
>>  	err = -ENOMEM;
>> -	rt = mpls_rt_alloc(cfg->rc_via_alen);
>> +	rt = mpls_rt_alloc(nhs);
>>  	if (!rt)
>>  		goto errout;
>> -
>> -	rt->rt_labels = cfg->rc_output_labels;
>> -	for (i = 0; i < rt->rt_labels; i++)
>> -		rt->rt_label[i] = cfg->rc_output_label[i];
>>  	rt->rt_protocol = cfg->rc_protocol;
>> -	RCU_INIT_POINTER(rt->rt_dev, dev);
>>  	rt->rt_payload_type = cfg->rc_payload_type;
>> -	rt->rt_via_table = cfg->rc_via_table;
>> -	memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen);
>>  
>> -	mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo);
>> +	if (cfg->rc_mp)
>> +		err = mpls_nh_build_multi(cfg, rt);
>> +	else
>> +		err = mpls_nh_build_from_cfg(cfg, rt);
>> +	if (err)
>> +		goto freert;
>> +
>> +	mpls_route_update(net, index, rt, &cfg->rc_nlinfo);
>>  
>> -	dev_put(dev);
>>  	return 0;
>>  
>> +freert:
>> +	mpls_rt_free(rt);
>>  errout:
>> -	if (dev)
>> -		dev_put(dev);
>>  	return err;
>>  }
>>  
>> @@ -538,7 +754,7 @@ static int mpls_route_del(struct mpls_route_config *cfg)
>>  	if (index >= net->mpls.platform_labels)
>>  		goto errout;
>>  
>> -	mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo);
>> +	mpls_route_update(net, index, NULL, &cfg->rc_nlinfo);
>>  
>>  	err = 0;
>>  errout:
>> @@ -628,6 +844,7 @@ static void mpls_ifdown(struct net_device *dev)
>>  	struct mpls_route __rcu **platform_label;
>>  	struct net *net = dev_net(dev);
>>  	struct mpls_dev *mdev;
>> +	struct mpls_nh *nh;
>>  	unsigned index;
>>  
>>  	platform_label = rtnl_dereference(net->mpls.platform_label);
>> @@ -635,9 +852,14 @@ static void mpls_ifdown(struct net_device *dev)
>>  		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
>>  		if (!rt)
>>  			continue;
>> -		if (rtnl_dereference(rt->rt_dev) != dev)
>> -			continue;
>> -		rt->rt_dev = NULL;
>> +		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
>> +			struct net_device *mdev;
>> +
>> +			mdev = rtnl_dereference(nh->nh_dev);
>> +			if (mdev != dev)
>> +				continue;
>> +			nh->nh_dev = NULL;
>> +		}
>>  	}
>>  
>>  	mdev = mpls_dev_get(dev);
>> @@ -736,7 +958,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
>>  EXPORT_SYMBOL_GPL(nla_put_labels);
>>  
>>  int nla_get_labels(const struct nlattr *nla,
>> -		   u32 max_labels, u32 *labels, u32 label[])
>> +		   u8 max_labels, u8 *labels, u32 label[])
>>  {
>>  	unsigned len = nla_len(nla);
>>  	unsigned nla_labels;
>> @@ -781,6 +1003,48 @@ int nla_get_labels(const struct nlattr *nla,
>>  }
>>  EXPORT_SYMBOL_GPL(nla_get_labels);
>>  
>> +int nla_get_via(const struct nlattr *nla, u8 *via_alen,
>> +		u8 *via_table, u8 via_addr[])
>> +{
>> +	struct rtvia *via = nla_data(nla);
>> +	int err = -EINVAL;
>> +	u8 alen;
>> +
>> +	if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
>> +		goto errout;
>> +	alen = nla_len(nla) -
>> +			offsetof(struct rtvia, rtvia_addr);
>> +	if (alen > MAX_VIA_ALEN)
>> +		goto errout;
>> +
>> +	/* Validate the address family */
>> +	switch (via->rtvia_family) {
>> +	case AF_PACKET:
>> +		*via_table = NEIGH_LINK_TABLE;
>> +		break;
>> +	case AF_INET:
>> +		*via_table = NEIGH_ARP_TABLE;
>> +		if (alen != 4)
>> +			goto errout;
>> +		break;
>> +	case AF_INET6:
>> +		*via_table = NEIGH_ND_TABLE;
>> +		if (alen != 16)
>> +			goto errout;
>> +		break;
>> +	default:
>> +		/* Unsupported address family */
>> +		goto errout;
>> +	}
>> +
>> +	memcpy(via_addr, via->rtvia_addr, alen);
>> +	*via_alen = alen;
>> +	err = 0;
>> +
>> +errout:
>> +	return err;
>> +}
>> +
>>  static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>>  			       struct mpls_route_config *cfg)
>>  {
>> @@ -844,7 +1108,7 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>>  			break;
>>  		case RTA_DST:
>>  		{
>> -			u32 label_count;
>> +			u8 label_count;
>>  			if (nla_get_labels(nla, 1, &label_count,
>>  					   &cfg->rc_label))
>>  				goto errout;
>> @@ -857,35 +1121,15 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>>  		}
>>  		case RTA_VIA:
>>  		{
>> -			struct rtvia *via = nla_data(nla);
>> -			if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
>> +			if (nla_get_via(nla, &cfg->rc_via_alen,
>> +					&cfg->rc_via_table, cfg->rc_via))
>>  				goto errout;
>> -			cfg->rc_via_alen   = nla_len(nla) -
>> -				offsetof(struct rtvia, rtvia_addr);
>> -			if (cfg->rc_via_alen > MAX_VIA_ALEN)
>> -				goto errout;
>> -
>> -			/* Validate the address family */
>> -			switch(via->rtvia_family) {
>> -			case AF_PACKET:
>> -				cfg->rc_via_table = NEIGH_LINK_TABLE;
>> -				break;
>> -			case AF_INET:
>> -				cfg->rc_via_table = NEIGH_ARP_TABLE;
>> -				if (cfg->rc_via_alen != 4)
>> -					goto errout;
>> -				break;
>> -			case AF_INET6:
>> -				cfg->rc_via_table = NEIGH_ND_TABLE;
>> -				if (cfg->rc_via_alen != 16)
>> -					goto errout;
>> -				break;
>> -			default:
>> -				/* Unsupported address family */
>> -				goto errout;
>> -			}
>> -
>> -			memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen);
>> +			break;
>> +		}
>> +		case RTA_MULTIPATH:
>> +		{
>> +			cfg->rc_mp = nla_data(nla);
>> +			cfg->rc_mp_len = nla_len(nla);
>>  			break;
>>  		}
>>  		default:
>> @@ -946,16 +1190,56 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
>>  	rtm->rtm_type = RTN_UNICAST;
>>  	rtm->rtm_flags = 0;
>>  
>> -	if (rt->rt_labels &&
>> -	    nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label))
>> -		goto nla_put_failure;
>> -	if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen))
>> -		goto nla_put_failure;
>> -	dev = rtnl_dereference(rt->rt_dev);
>> -	if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
>> -		goto nla_put_failure;
>>  	if (nla_put_labels(skb, RTA_DST, 1, &label))
>>  		goto nla_put_failure;
>> +	if (rt->rt_nhn == 1) {
>> +		struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs,
>> +							struct mpls_nh,
>> +							nh_next);
>> +
>> +		if (nh->nh_labels &&
>> +		    nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
>> +				   nh->nh_label))
>> +			goto nla_put_failure;
>> +		if (nla_put_via(skb, nh->nh_via_table, nh->nh_via,
>> +				nh->nh_via_alen))
>> +			goto nla_put_failure;
>> +		dev = rtnl_dereference(nh->nh_dev);
>> +		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
>> +			goto nla_put_failure;
>> +	} else {
>> +		struct rtnexthop *rtnh;
>> +		struct nlattr *mp;
>> +		struct mpls_nh *nh;
>> +
>> +		mp = nla_nest_start(skb, RTA_MULTIPATH);
>> +		if (!mp)
>> +			goto nla_put_failure;
>> +
>> +		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
>> +			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
>> +			if (!rtnh)
>> +				goto nla_put_failure;
>> +
>> +			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
>> +			dev = rtnl_dereference(nh->nh_dev);
>> +			if (dev)
>> +				rtnh->rtnh_ifindex = dev->ifindex;
>> +			if (nh->nh_labels &&
>> +			    nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
>> +					   nh->nh_label))
>> +				goto nla_put_failure;
>> +			if (nla_put_via(skb, nh->nh_via_table,
>> +					nh->nh_via,
>> +					nh->nh_via_alen))
>> +				goto nla_put_failure;
>> +
>> +			/* length of rtnetlink header + attributes */
>> +			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
>> +		}
>> +
>> +		nla_nest_end(skb, mp);
>> +	}
>>  
>>  	nlmsg_end(skb, nlh);
>>  	return 0;
>> @@ -1000,12 +1284,34 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
>>  {
>>  	size_t payload =
>>  		NLMSG_ALIGN(sizeof(struct rtmsg))
>> -		+ nla_total_size(2 + rt->rt_via_alen)	/* RTA_VIA */
>>  		+ nla_total_size(4);			/* RTA_DST */
>> -	if (rt->rt_labels)				/* RTA_NEWDST */
>> -		payload += nla_total_size(rt->rt_labels * 4);
>> -	if (rt->rt_dev)					/* RTA_OIF */
>> -		payload += nla_total_size(4);
>> +
>> +	if (rt->rt_nhn == 1) {
>> +		struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs,
>> +							      struct mpls_nh,
>> +							      nh_next);
>> +
>> +		if (nh->nh_dev)
>> +			payload += nla_total_size(4); /* RTA_OIF */
>> +		payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */
>> +		if (nh->nh_labels) /* RTA_NEWDST */
>> +			payload += nla_total_size(nh->nh_labels * 4);
>> +	} else {
>> +		struct mpls_nh *nh;
>> +		/* each nexthop is packed in an attribute */
>> +		size_t nhsize = 0;
>> +
>> +		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
>> +			nhsize += nla_total_size(sizeof(struct rtnexthop)) +
>> +					nla_total_size(nh->nh_via_alen +
>> +						       2); /* RTA_VIA */
>> +			if (nh->nh_labels) /* RTA_NEWDST */
>> +				nhsize += nla_total_size(nh->nh_labels * 4);
>> +		}
>> +		/* nested attribute */
>> +		payload += nla_total_size(nhsize);
>> +	}
>> +
>>  	return payload;
>>  }
>>  
>> @@ -1057,25 +1363,37 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>>  	/* In case the predefined labels need to be populated */
>>  	if (limit > MPLS_LABEL_IPV4NULL) {
>>  		struct net_device *lo = net->loopback_dev;
>> -		rt0 = mpls_rt_alloc(lo->addr_len);
>> +		struct mpls_nh *nh;
>> +
>> +		rt0 = mpls_rt_alloc(1);
>>  		if (!rt0)
>>  			goto nort0;
>> -		RCU_INIT_POINTER(rt0->rt_dev, lo);
>>  		rt0->rt_protocol = RTPROT_KERNEL;
>>  		rt0->rt_payload_type = MPT_IPV4;
>> -		rt0->rt_via_table = NEIGH_LINK_TABLE;
>> -		memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
>> +		nh = mpls_nh_alloc(lo->addr_len);
>> +		if (!nh)
>> +			goto nort2;
>> +		RCU_INIT_POINTER(nh->nh_dev, lo);
>> +		nh->nh_via_table = NEIGH_LINK_TABLE;
>> +		memcpy(nh->nh_via, lo->dev_addr, lo->addr_len);
>> +		list_add_tail(&nh->nh_next, &rt0->rt_nhs);
>>  	}
>>  	if (limit > MPLS_LABEL_IPV6NULL) {
>>  		struct net_device *lo = net->loopback_dev;
>> -		rt2 = mpls_rt_alloc(lo->addr_len);
>> +		struct mpls_nh *nh;
>> +
>> +		rt2 = mpls_rt_alloc(1);
>>  		if (!rt2)
>>  			goto nort2;
>> -		RCU_INIT_POINTER(rt2->rt_dev, lo);
>>  		rt2->rt_protocol = RTPROT_KERNEL;
>>  		rt2->rt_payload_type = MPT_IPV6;
>> -		rt2->rt_via_table = NEIGH_LINK_TABLE;
>> -		memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
>> +		nh = mpls_nh_alloc(lo->addr_len);
>> +		if (!nh)
>> +			goto nort2;
>> +		RCU_INIT_POINTER(nh->nh_dev, lo);
>> +		nh->nh_via_table = NEIGH_LINK_TABLE;
>> +		memcpy(nh->nh_via, lo->dev_addr, lo->addr_len);
>> +		list_add_tail(&nh->nh_next, &rt2->rt_nhs);
>>  	}
>>  
>>  	rtnl_lock();
>> @@ -1085,7 +1403,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>>  
>>  	/* Free any labels beyond the new table */
>>  	for (index = limit; index < old_limit; index++)
>> -		mpls_route_update(net, index, NULL, NULL, NULL);
>> +		mpls_route_update(net, index, NULL, NULL);
>>  
>>  	/* Copy over the old labels */
>>  	cp_size = size;
>> @@ -1124,6 +1442,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>>  
>>  nort2:
>>  	mpls_rt_free(rt0);
>> +	mpls_rt_free(rt2);
>>  nort0:
>>  	kvfree(labels);
>>  nolabels:
>> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
>> index 2681a4b..9e18b58 100644
>> --- a/net/mpls/internal.h
>> +++ b/net/mpls/internal.h
>> @@ -1,6 +1,17 @@
>>  #ifndef MPLS_INTERNAL_H
>>  #define MPLS_INTERNAL_H
>>  
>> +enum mpls_payload_type {
>> +	MPT_UNSPEC, /* IPv4 or IPv6 */
>> +	MPT_IPV4 = 4,
>> +	MPT_IPV6 = 6,
>> +
>> +	/* Other types not implemented:
>> +	 *  - Pseudo-wire with or without control word (RFC4385)
>> +	 *  - GAL (RFC5586)
>> +	 */
>> +};
>> +
>>  struct mpls_shim_hdr {
>>  	__be32 label_stack_entry;
>>  };
>> @@ -21,6 +32,34 @@ struct mpls_dev {
>>  
>>  struct sk_buff;
>>  
>> +#define LABEL_NOT_SPECIFIED (1 << 20)
>> +#define MAX_NEW_LABELS 2
>> +
>> +/* This maximum ha length copied from the definition of struct neighbour */
>> +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
>> +
>> +struct mpls_nh {
>> +	struct net_device __rcu *nh_dev;
>> +	u32			nh_label[MAX_NEW_LABELS];
>> +	unsigned int		nh_flags;
>> +	int                     nh_weight;
>> +	int                     nh_power;
>> +	struct list_head	nh_next;
>> +	u8			nh_labels;
>> +	u8			nh_via_alen;
>> +	u8			nh_via_table;
>> +	u8			nh_via[0];
>> +};
>> +
>> +struct mpls_route {
>> +	struct rcu_head		rt_rcu;
>> +	u8			rt_protocol;
>> +	u8			rt_payload_type;
>> +	int                     rt_power;
>> +	int			rt_nhn;
>> +	struct list_head        rt_nhs;
>> +};
>> +
>>  static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
>>  {
>>  	return (struct mpls_shim_hdr *)skb_network_header(skb);
>> @@ -52,8 +91,10 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
>>  
>>  int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
>>  		   const u32 label[]);
>> -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels,
>> +int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels,
>>  		   u32 label[]);
>> +int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
>> +		u8 via[]);
>>  bool mpls_output_possible(const struct net_device *dev);
>>  unsigned int mpls_dev_mtu(const struct net_device *dev);
>>  bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu Oct. 6, 2015, 8:26 p.m. UTC | #3
On 10/6/15, 12:44 PM, Eric W. Biederman wrote:
> Roopa Prabhu <roopa@cumulusnetworks.com> writes:
>
>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>
>> This patch adds support for MPLS multipath routes.
>>
>> Includes following changes to support multipath:
>> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh'
>>
>> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry
>>
>> - moves mpls route and nexthop structures into internal.h
>>
>> - A mpls_route can point to multiple mpls_nh structs
>>
>> - the nexthops are maintained as a list
> So I am not certain I like nexthops being a list.  In the practical case
> introducing this list guarantees that everyone will see at least an
> extra cache line miss in the forwarding path.
>
> In the more abstract sense a list is the wrong data structure.  If the
> list is so short we can afford to walk it an array is a better data
> structure.  If we need enough entries to make the memory consumption
> of an array a concern we want some kind of hash table or tree data
> structure, because a list will be too long in that case.
>
> So can we please not use a list?
Sure, I used arrays the first time: http://marc.info/?l=linux-netdev&m=143932956719398&w=2
And I am very much OK with an array.  I used a list in v2 to follow the IPv6 FIB code, in response to comments on v1.


The only place where the lookup is sensitive is the nexthop selection in the datapath. Depending
on how the selection algorithm works, I am not sure that using a hash table will help there.
I will look into it, though.

I did prefer an array, and if you are OK with an array, I will respin.

>
> I expect we can simplify the data structures by noting that rt_via must
> be an ethernet mac today so that 6 bytes are enough and 8 bytes gives us
> a bit extra and aligns things nicely.
>
> Also I know it goes away in the next patch but a spinlock taken for
> every transit through the forwarding path really bugs me.
Yes, agreed. I picked that up from the IPv4 FIB code. Since it goes away with Robert's patch, I did not spend any time on it.

Thanks for the review.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu Oct. 6, 2015, 8:31 p.m. UTC | #4
On 10/6/15, 1:11 PM, Eric W. Biederman wrote:
> ebiederm@xmission.com (Eric W. Biederman) writes:
>
>> Roopa Prabhu <roopa@cumulusnetworks.com> writes:
>>
>>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>>
>>> This patch adds support for MPLS multipath routes.
>>>
>>> Includes following changes to support multipath:
>>> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh'
>>>
>>> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry
>>>
>>> - moves mpls route and nexthop structures into internal.h
>>>
>>> - A mpls_route can point to multiple mpls_nh structs
>>>
>>> - the nexthops are maintained as a list
>> So I am not certain I like nexthops being a list.  In the practical case
>> introducing this list guarantees that everyone will see at least an
>> extra cache line miss in the forwarding path.
>>
>> In the more abstract sense a list is the wrong data structure.  If the
>> list is so short we can afford to walk it an array is a better data
>> structure.  If we need enough entries to make the memory consumption
>> of an array a concern we want some kind of hash table or tree data
>> structure, because a list will be too long in that case.
>>
>> So can we please not use a list?
>>
>> I expect we can simplify the data structures by noting that rt_via must
>> be an ethernet mac today so that 6 bytes are enough and 8 bytes gives us
>> a bit extra and aligns things nicely.
> Grr. My mistake.  The current worst case is 16 bytes for an ipv6
> address in rt_via.  But the point remains that a fixed sized array of
> bytes in rt_via allows the use of an array and not a list for nexthops.
>
> At least for the single nexthop case I really want something that is
> small enough it fits in a single 64byte cache line.  The performance
> compared to anything else is going to be noticeable.
>
Agreed. I just responded to your last email. I moved from an array to a list only because of the extra bytes.
I would prefer an array too.

http://marc.info/?l=linux-netdev&m=143932956719398&w=2

or

https://patchwork.ozlabs.org/patch/506226/


A link to the full series is here: http://marc.info/?l=linux-netdev&m=143932955919395&w=2

thanks,
Roopa

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Oct. 7, 2015, 3:38 a.m. UTC | #5
roopa <roopa@cumulusnetworks.com> writes:

> On 10/6/15, 12:44 PM, Eric W. Biederman wrote:
>> Roopa Prabhu <roopa@cumulusnetworks.com> writes:
>>
>>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>>
>>> This patch adds support for MPLS multipath routes.
>>>
>>> Includes following changes to support multipath:
>>> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh'
>>>
>>> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry
>>>
>>> - moves mpls route and nexthop structures into internal.h
>>>
>>> - A mpls_route can point to multiple mpls_nh structs
>>>
>>> - the nexthops are maintained as a list
>> So I am not certain I like nexthops being a list.  In the practical case
>> introducing this list guarantees that everyone will see at least an
>> extra cache line miss in the forwarding path.
>>
>> In the more abstract sense a list is the wrong data structure.  If the
>> list is so short we can afford to walk it an array is a better data
>> structure.  If we need enough entries to make the memory consumption
>> of an array a concern we want some kind of hash table or tree data
>> structure, because a list will be too long in that case.
>>
>> So can we please not use a list?
> sure, I used arrays the first time. http://marc.info/?l=linux-netdev&m=143932956719398&w=2
> And i am very much ok with an array.  I used list in v2 by following the ipv6 fib code following comments from v1.
>
>
> The only place the lookup is sensitive is in the nexthop selection in datapath. And depending
> on how the selection algorithm works, i am not sure if using a hash table will help there.
> I will look though.
>
> I did prefer an array and If you are ok with an array, I will respin.

Please.  And let's cut out any fields we are not using yet.  If nothing
else, lean and mean keeps this code more understandable and reviewable, as
at the end of the day there is less of it.

>> I expect we can simplify the data structures by noting that rt_via must
>> be an ethernet mac today so that 6 bytes are enough and 8 bytes gives us
>> a bit extra and aligns things nicely.
>>
>> Also I know it goes away in the next patch but a spinlock taken for
>> every transit through the forwarding path really bugs me.
> yes, agree. I picked that from ipv4 fib. since it goes away with Roberts patch I did not spend any time on it.
>
> thanks for the review.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
index 4757997..179253f 100644
--- a/include/net/mpls_iptunnel.h
+++ b/include/net/mpls_iptunnel.h
@@ -18,7 +18,7 @@ 
 
 struct mpls_iptunnel_encap {
 	u32	label[MAX_NEW_LABELS];
-	u32	labels;
+	u8	labels;
 };
 
 static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate)
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 8c5707d..ae9e153 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -19,39 +19,12 @@ 
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #endif
+#include <net/nexthop.h>
 #include "internal.h"
 
-#define LABEL_NOT_SPECIFIED (1<<20)
-#define MAX_NEW_LABELS 2
-
-/* This maximum ha length copied from the definition of struct neighbour */
-#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
-
-enum mpls_payload_type {
-	MPT_UNSPEC, /* IPv4 or IPv6 */
-	MPT_IPV4 = 4,
-	MPT_IPV6 = 6,
-
-	/* Other types not implemented:
-	 *  - Pseudo-wire with or without control word (RFC4385)
-	 *  - GAL (RFC5586)
-	 */
-};
-
-struct mpls_route { /* next hop label forwarding entry */
-	struct net_device __rcu *rt_dev;
-	struct rcu_head		rt_rcu;
-	u32			rt_label[MAX_NEW_LABELS];
-	u8			rt_protocol; /* routing protocol that set this entry */
-	u8                      rt_payload_type;
-	u8			rt_labels;
-	u8			rt_via_alen;
-	u8			rt_via_table;
-	u8			rt_via[0];
-};
-
 static int zero = 0;
 static int label_limit = (1 << 20) - 1;
+static DEFINE_SPINLOCK(mpls_multipath_lock);
 
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
 		       struct nlmsghdr *nlh, struct net *net, u32 portid,
@@ -80,10 +53,10 @@  bool mpls_output_possible(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(mpls_output_possible);
 
-static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
+static unsigned int mpls_nh_header_size(const struct mpls_nh *nh)
 {
 	/* The size of the layer 2.5 labels to be added for this route */
-	return rt->rt_labels * sizeof(struct mpls_shim_hdr);
+	return nh->nh_labels * sizeof(struct mpls_shim_hdr);
 }
 
 unsigned int mpls_dev_mtu(const struct net_device *dev)
@@ -105,8 +78,58 @@  bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
-static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
-			struct mpls_entry_decoded dec)
+/* This is a cut/copy/modify from fib_select_multipath */
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt)
+{
+	struct mpls_nh *nh;
+	struct mpls_nh *ret_nh;
+	int nhsel = 0;
+	int w;
+
+	spin_lock_bh(&mpls_multipath_lock);
+	ret_nh = list_first_entry_or_null(&rt->rt_nhs, struct mpls_nh,
+					  nh_next);
+	if (rt->rt_power <= 0) {
+		int power = 0;
+
+		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
+			power += nh->nh_weight;
+			nh->nh_power = nh->nh_weight;
+		}
+		rt->rt_power = power;
+		if (power <= 0) {
+			spin_unlock_bh(&mpls_multipath_lock);
+			/* Race condition: route has just become dead. */
+			return ret_nh;
+		}
+	}
+
+	/* w should be random number [0..rt->rt_power-1],
+	 * it is pretty bad approximation.
+	 */
+	w = jiffies % rt->rt_power;
+
+	list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
+		if (nh->nh_power) {
+			w -= nh->nh_power;
+			if (w <= 0) {
+				nh->nh_power--;
+				rt->rt_power--;
+				ret_nh = nh;
+				spin_unlock_bh(&mpls_multipath_lock);
+				return ret_nh;
+			}
+		}
+		nhsel++;
+	}
+
+	/* Race condition: route has just become dead. */
+	spin_unlock_bh(&mpls_multipath_lock);
+	return ret_nh;
+}
+
+static bool mpls_egress(struct mpls_route *rt, struct mpls_nh *nh,
+			struct sk_buff *skb, struct mpls_entry_decoded dec)
 {
 	enum mpls_payload_type payload_type;
 	bool success = false;
@@ -159,6 +182,7 @@  static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	struct net *net = dev_net(dev);
 	struct mpls_shim_hdr *hdr;
 	struct mpls_route *rt;
+	struct mpls_nh *nh;
 	struct mpls_entry_decoded dec;
 	struct net_device *out_dev;
 	struct mpls_dev *mdev;
@@ -196,9 +220,13 @@  static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	if (!rt)
 		goto drop;
 
+	nh = mpls_select_multipath(rt);
+	if (!nh)
+		goto drop;
+
 	/* Find the output device */
-	out_dev = rcu_dereference(rt->rt_dev);
-	if (!mpls_output_possible(out_dev))
+	out_dev = rcu_dereference(nh->nh_dev);
+	if (!out_dev || !mpls_output_possible(out_dev))
 		goto drop;
 
 	if (skb_warn_if_lro(skb))
@@ -212,7 +240,7 @@  static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	dec.ttl -= 1;
 
 	/* Verify the destination can hold the packet */
-	new_header_size = mpls_rt_header_size(rt);
+	new_header_size = mpls_nh_header_size(nh);
 	mtu = mpls_dev_mtu(out_dev);
 	if (mpls_pkt_too_big(skb, mtu - new_header_size))
 		goto drop;
@@ -230,7 +258,7 @@  static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 
 	if (unlikely(!new_header_size && dec.bos)) {
 		/* Penultimate hop popping */
-		if (!mpls_egress(rt, skb, dec))
+		if (!mpls_egress(rt, nh, skb, dec))
 			goto drop;
 	} else {
 		bool bos;
@@ -240,13 +268,14 @@  static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 		/* Push the new labels */
 		hdr = mpls_hdr(skb);
 		bos = dec.bos;
-		for (i = rt->rt_labels - 1; i >= 0; i--) {
-			hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos);
+		for (i = nh->nh_labels - 1; i >= 0; i--) {
+			hdr[i] = mpls_entry_encode(nh->nh_label[i],
+						   dec.ttl, 0, bos);
 			bos = false;
 		}
 	}
 
-	err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb);
+	err = neigh_xmit(nh->nh_via_table, out_dev, nh->nh_via, skb);
 	if (err)
 		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
 				    __func__, err);
@@ -270,31 +299,43 @@  static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = {
 struct mpls_route_config {
 	u32			rc_protocol;
 	u32			rc_ifindex;
-	u16			rc_via_table;
-	u16			rc_via_alen;
+	u8			rc_via_table;
+	u8			rc_via_alen;
 	u8			rc_via[MAX_VIA_ALEN];
+	u8			rc_output_labels;
 	u32			rc_label;
-	u32			rc_output_labels;
 	u32			rc_output_label[MAX_NEW_LABELS];
 	u32			rc_nlflags;
 	enum mpls_payload_type	rc_payload_type;
 	struct nl_info		rc_nlinfo;
+	struct rtnexthop	*rc_mp;
+	int			rc_mp_len;
 };
 
-static struct mpls_route *mpls_rt_alloc(size_t alen)
+static struct mpls_route *mpls_rt_alloc(int num_nh)
 {
 	struct mpls_route *rt;
 
-	rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
-	if (rt)
-		rt->rt_via_alen = alen;
+	rt = kzalloc(sizeof(*rt), GFP_KERNEL);
+	if (rt) {
+		rt->rt_nhn = num_nh;
+		INIT_LIST_HEAD(&rt->rt_nhs);
+	}
+
 	return rt;
 }
 
 static void mpls_rt_free(struct mpls_route *rt)
 {
-	if (rt)
+	struct mpls_nh *nh, *nh_safe;
+
+	if (rt) {
+		list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) {
+			list_del(&nh->nh_next);
+			kfree(nh);
+		}
 		kfree_rcu(rt, rt_rcu);
+	}
 }
 
 static void mpls_notify_route(struct net *net, unsigned index,
@@ -312,25 +353,22 @@  static void mpls_notify_route(struct net *net, unsigned index,
 }
 
 static void mpls_route_update(struct net *net, unsigned index,
-			      struct net_device *dev, struct mpls_route *new,
+			      struct mpls_route *new,
 			      const struct nl_info *info)
 {
 	struct mpls_route __rcu **platform_label;
-	struct mpls_route *rt, *old = NULL;
+	struct mpls_route *rt;
 
 	ASSERT_RTNL();
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	rt = rtnl_dereference(platform_label[index]);
-	if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) {
-		rcu_assign_pointer(platform_label[index], new);
-		old = rt;
-	}
+	rcu_assign_pointer(platform_label[index], new);
 
-	mpls_notify_route(net, index, old, new, info);
+	mpls_notify_route(net, index, rt, new, info);
 
 	/* If we removed a route free it now */
-	mpls_rt_free(old);
+	mpls_rt_free(rt);
 }
 
 static unsigned find_free_label(struct net *net)
@@ -406,23 +444,23 @@  static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr)
 #endif
 
 static struct net_device *find_outdev(struct net *net,
-				      struct mpls_route_config *cfg)
+				      struct mpls_nh *nh, int oif)
 {
 	struct net_device *dev = NULL;
 
-	if (!cfg->rc_ifindex) {
-		switch (cfg->rc_via_table) {
+	if (!oif) {
+		switch (nh->nh_via_table) {
 		case NEIGH_ARP_TABLE:
-			dev = inet_fib_lookup_dev(net, cfg->rc_via);
+			dev = inet_fib_lookup_dev(net, nh->nh_via);
 			break;
 		case NEIGH_ND_TABLE:
-			dev = inet6_fib_lookup_dev(net, cfg->rc_via);
+			dev = inet6_fib_lookup_dev(net, nh->nh_via);
 			break;
 		case NEIGH_LINK_TABLE:
 			break;
 		}
 	} else {
-		dev = dev_get_by_index(net, cfg->rc_ifindex);
+		dev = dev_get_by_index(net, oif);
 	}
 
 	if (!dev)
@@ -431,15 +469,208 @@  static struct net_device *find_outdev(struct net *net,
 	return dev;
 }
 
+static int mpls_nh_assign_dev(struct net *net, struct mpls_nh *nh, int oif)
+{
+	struct net_device *dev = NULL;
+	int err = -ENODEV;
+
+	dev = find_outdev(net, nh, oif);
+	if (IS_ERR(dev)) {
+		err = PTR_ERR(dev);
+		dev = NULL;
+		goto errout;
+	}
+
+	/* Ensure this is a supported device */
+	err = -EINVAL;
+	if (!mpls_dev_get(dev))
+		goto errout;
+
+	RCU_INIT_POINTER(nh->nh_dev, dev);
+	dev_put(dev);
+
+	return 0;
+
+errout:
+	if (dev)
+		dev_put(dev);
+	return err;
+}
+
+static struct mpls_nh *mpls_nh_alloc(size_t alen)
+{
+	struct mpls_nh *nh;
+
+	nh = kzalloc(sizeof(*nh) + alen, GFP_KERNEL);
+	if (nh)
+		nh->nh_via_alen = alen;
+
+	return nh;
+}
+
+static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
+				  struct mpls_route *rt)
+{
+	struct net *net = cfg->rc_nlinfo.nl_net;
+	struct mpls_nh *nh = NULL;
+	int err;
+	int i;
+
+	err = -EINVAL;
+	/* Ensure only a supported number of labels are present */
+	if (cfg->rc_output_labels > MAX_NEW_LABELS)
+		goto errout;
+
+	err = -ENOMEM;
+	nh = mpls_nh_alloc(cfg->rc_via_alen);
+	if (!nh)
+		goto errout;
+
+	nh->nh_labels = cfg->rc_output_labels;
+	for (i = 0; i < nh->nh_labels; i++)
+		nh->nh_label[i] = cfg->rc_output_label[i];
+
+	nh->nh_via_table = cfg->rc_via_table;
+	memcpy(nh->nh_via, cfg->rc_via, cfg->rc_via_alen);
+	nh->nh_via_alen = cfg->rc_via_alen;
+
+	err = mpls_nh_assign_dev(net, nh, cfg->rc_ifindex);
+	if (err)
+		goto errout;
+
+	list_add_tail(&nh->nh_next, &rt->rt_nhs);
+
+	return 0;
+
+errout:
+	kfree(nh);
+
+	return err;
+}
+
+static int mpls_nh_build(struct net *net, struct mpls_nh **rt_nh,
+			 int oif, struct nlattr *via_attr,
+			 struct nlattr *newdst)
+{
+	struct mpls_nh *nh = NULL;
+	int err;
+	u8 via_alen;
+	u8 via_table;
+	u8 via[MAX_VIA_ALEN];
+
+	err = nla_get_via(via_attr, &via_alen, &via_table,
+			  via);
+	if (err)
+		goto errout;
+
+	nh = mpls_nh_alloc(via_alen);
+	if (!nh)
+		goto errout;
+
+	if (newdst) {
+		err = nla_get_labels(newdst, MAX_NEW_LABELS,
+				     &nh->nh_labels, nh->nh_label);
+		if (err)
+			goto errout;
+	}
+	nh->nh_via_table = via_table;
+	memcpy(nh->nh_via, via, via_alen);
+
+	err = mpls_nh_assign_dev(net, nh, oif);
+	if (err)
+		goto errout;
+
+	*rt_nh = nh;
+
+	return 0;
+
+errout:
+	kfree(nh);
+
+	return err;
+}
+
+static int mpls_count_nexthops(struct rtnexthop *rtnh, int len)
+{
+	int nhs = 0;
+	int remaining = len;
+
+	while (rtnh_ok(rtnh, remaining)) {
+		nhs++;
+		rtnh = rtnh_next(rtnh, &remaining);
+	}
+
+	/* leftover implies invalid nexthop configuration, discard it */
+	return remaining > 0 ? 0 : nhs;
+}
+
+static int mpls_nh_build_multi(struct mpls_route_config *cfg,
+			       struct mpls_route *rt)
+{
+	struct rtnexthop *rtnh = cfg->rc_mp;
+	struct nlattr *nla_via, *nla_newdst;
+	int remaining = cfg->rc_mp_len;
+	struct mpls_nh *nh, *nh_safe;
+	int nhs = 0;
+	int err = 0;
+
+	while (rtnh_ok(rtnh, remaining)) {
+		int attrlen;
+
+		nla_via = NULL;
+		nla_newdst = NULL;
+		nh = NULL;
+
+		err = -EINVAL;
+		if (!rtnh_ok(rtnh, remaining))
+			goto errout;
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {
+			struct nlattr *attrs = rtnh_attrs(rtnh);
+
+			nla_via = nla_find(attrs, attrlen, RTA_VIA);
+			nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST);
+		}
+
+		err = -EINVAL;
+		if (!nla_via)
+			goto errout;
+
+		err = mpls_nh_build(cfg->rc_nlinfo.nl_net, &nh,
+				    rtnh->rtnh_ifindex, nla_via,
+				    nla_newdst);
+		if (err)
+			goto errout;
+
+		nh->nh_weight = rtnh->rtnh_hops + 1;
+		list_add_tail(&nh->nh_next, &rt->rt_nhs);
+
+		rtnh = rtnh_next(rtnh, &remaining);
+		nhs++;
+	}
+
+	rt->rt_nhn = nhs;
+
+	return 0;
+
+errout:
+	list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) {
+		list_del(&nh->nh_next);
+		kfree(nh);
+	}
+
+	return err;
+}
+
 static int mpls_route_add(struct mpls_route_config *cfg)
 {
 	struct mpls_route __rcu **platform_label;
 	struct net *net = cfg->rc_nlinfo.nl_net;
-	struct net_device *dev = NULL;
 	struct mpls_route *rt, *old;
-	unsigned index;
-	int i;
 	int err = -EINVAL;
+	unsigned index;
+	int nhs = 1; /* default to one nexthop */
 
 	index = cfg->rc_label;
 
@@ -457,27 +688,6 @@  static int mpls_route_add(struct mpls_route_config *cfg)
 	if (index >= net->mpls.platform_labels)
 		goto errout;
 
-	/* Ensure only a supported number of labels are present */
-	if (cfg->rc_output_labels > MAX_NEW_LABELS)
-		goto errout;
-
-	dev = find_outdev(net, cfg);
-	if (IS_ERR(dev)) {
-		err = PTR_ERR(dev);
-		dev = NULL;
-		goto errout;
-	}
-
-	/* Ensure this is a supported device */
-	err = -EINVAL;
-	if (!mpls_dev_get(dev))
-		goto errout;
-
-	err = -EINVAL;
-	if ((cfg->rc_via_table == NEIGH_LINK_TABLE) &&
-	    (dev->addr_len != cfg->rc_via_alen))
-		goto errout;
-
 	/* Append makes no sense with mpls */
 	err = -EOPNOTSUPP;
 	if (cfg->rc_nlflags & NLM_F_APPEND)
@@ -497,28 +707,34 @@  static int mpls_route_add(struct mpls_route_config *cfg)
 	if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old)
 		goto errout;
 
+	if (cfg->rc_mp) {
+		err = -EINVAL;
+		nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len);
+		if (nhs == 0)
+			goto errout;
+	}
+
 	err = -ENOMEM;
-	rt = mpls_rt_alloc(cfg->rc_via_alen);
+	rt = mpls_rt_alloc(nhs);
 	if (!rt)
 		goto errout;
-
-	rt->rt_labels = cfg->rc_output_labels;
-	for (i = 0; i < rt->rt_labels; i++)
-		rt->rt_label[i] = cfg->rc_output_label[i];
 	rt->rt_protocol = cfg->rc_protocol;
-	RCU_INIT_POINTER(rt->rt_dev, dev);
 	rt->rt_payload_type = cfg->rc_payload_type;
-	rt->rt_via_table = cfg->rc_via_table;
-	memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen);
 
-	mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo);
+	if (cfg->rc_mp)
+		err = mpls_nh_build_multi(cfg, rt);
+	else
+		err = mpls_nh_build_from_cfg(cfg, rt);
+	if (err)
+		goto freert;
+
+	mpls_route_update(net, index, rt, &cfg->rc_nlinfo);
 
-	dev_put(dev);
 	return 0;
 
+freert:
+	mpls_rt_free(rt);
 errout:
-	if (dev)
-		dev_put(dev);
 	return err;
 }
 
@@ -538,7 +754,7 @@  static int mpls_route_del(struct mpls_route_config *cfg)
 	if (index >= net->mpls.platform_labels)
 		goto errout;
 
-	mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo);
+	mpls_route_update(net, index, NULL, &cfg->rc_nlinfo);
 
 	err = 0;
 errout:
@@ -628,6 +844,7 @@  static void mpls_ifdown(struct net_device *dev)
 	struct mpls_route __rcu **platform_label;
 	struct net *net = dev_net(dev);
 	struct mpls_dev *mdev;
+	struct mpls_nh *nh;
 	unsigned index;
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
@@ -635,9 +852,14 @@  static void mpls_ifdown(struct net_device *dev)
 		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
 		if (!rt)
 			continue;
-		if (rtnl_dereference(rt->rt_dev) != dev)
-			continue;
-		rt->rt_dev = NULL;
+		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
+			struct net_device *mdev;
+
+			mdev = rtnl_dereference(nh->nh_dev);
+			if (mdev != dev)
+				continue;
+			nh->nh_dev = NULL;
+		}
 	}
 
 	mdev = mpls_dev_get(dev);
@@ -736,7 +958,7 @@  int nla_put_labels(struct sk_buff *skb, int attrtype,
 EXPORT_SYMBOL_GPL(nla_put_labels);
 
 int nla_get_labels(const struct nlattr *nla,
-		   u32 max_labels, u32 *labels, u32 label[])
+		   u8 max_labels, u8 *labels, u32 label[])
 {
 	unsigned len = nla_len(nla);
 	unsigned nla_labels;
@@ -781,6 +1003,48 @@  int nla_get_labels(const struct nlattr *nla,
 }
 EXPORT_SYMBOL_GPL(nla_get_labels);
 
+int nla_get_via(const struct nlattr *nla, u8 *via_alen,
+		u8 *via_table, u8 via_addr[])
+{
+	struct rtvia *via = nla_data(nla);
+	int err = -EINVAL;
+	u8 alen;
+
+	if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
+		goto errout;
+	alen = nla_len(nla) -
+			offsetof(struct rtvia, rtvia_addr);
+	if (alen > MAX_VIA_ALEN)
+		goto errout;
+
+	/* Validate the address family */
+	switch (via->rtvia_family) {
+	case AF_PACKET:
+		*via_table = NEIGH_LINK_TABLE;
+		break;
+	case AF_INET:
+		*via_table = NEIGH_ARP_TABLE;
+		if (alen != 4)
+			goto errout;
+		break;
+	case AF_INET6:
+		*via_table = NEIGH_ND_TABLE;
+		if (alen != 16)
+			goto errout;
+		break;
+	default:
+		/* Unsupported address family */
+		goto errout;
+	}
+
+	memcpy(via_addr, via->rtvia_addr, alen);
+	*via_alen = alen;
+	err = 0;
+
+errout:
+	return err;
+}
+
 static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
 			       struct mpls_route_config *cfg)
 {
@@ -844,7 +1108,7 @@  static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
 			break;
 		case RTA_DST:
 		{
-			u32 label_count;
+			u8 label_count;
 			if (nla_get_labels(nla, 1, &label_count,
 					   &cfg->rc_label))
 				goto errout;
@@ -857,35 +1121,15 @@  static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
 		}
 		case RTA_VIA:
 		{
-			struct rtvia *via = nla_data(nla);
-			if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
+			if (nla_get_via(nla, &cfg->rc_via_alen,
+					&cfg->rc_via_table, cfg->rc_via))
 				goto errout;
-			cfg->rc_via_alen   = nla_len(nla) -
-				offsetof(struct rtvia, rtvia_addr);
-			if (cfg->rc_via_alen > MAX_VIA_ALEN)
-				goto errout;
-
-			/* Validate the address family */
-			switch(via->rtvia_family) {
-			case AF_PACKET:
-				cfg->rc_via_table = NEIGH_LINK_TABLE;
-				break;
-			case AF_INET:
-				cfg->rc_via_table = NEIGH_ARP_TABLE;
-				if (cfg->rc_via_alen != 4)
-					goto errout;
-				break;
-			case AF_INET6:
-				cfg->rc_via_table = NEIGH_ND_TABLE;
-				if (cfg->rc_via_alen != 16)
-					goto errout;
-				break;
-			default:
-				/* Unsupported address family */
-				goto errout;
-			}
-
-			memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen);
+			break;
+		}
+		case RTA_MULTIPATH:
+		{
+			cfg->rc_mp = nla_data(nla);
+			cfg->rc_mp_len = nla_len(nla);
 			break;
 		}
 		default:
@@ -946,16 +1190,56 @@  static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	rtm->rtm_type = RTN_UNICAST;
 	rtm->rtm_flags = 0;
 
-	if (rt->rt_labels &&
-	    nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label))
-		goto nla_put_failure;
-	if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen))
-		goto nla_put_failure;
-	dev = rtnl_dereference(rt->rt_dev);
-	if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
-		goto nla_put_failure;
 	if (nla_put_labels(skb, RTA_DST, 1, &label))
 		goto nla_put_failure;
+	if (rt->rt_nhn == 1) {
+		struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs,
+							struct mpls_nh,
+							nh_next);
+
+		if (nh->nh_labels &&
+		    nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
+				   nh->nh_label))
+			goto nla_put_failure;
+		if (nla_put_via(skb, nh->nh_via_table, nh->nh_via,
+				nh->nh_via_alen))
+			goto nla_put_failure;
+		dev = rtnl_dereference(nh->nh_dev);
+		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
+			goto nla_put_failure;
+	} else {
+		struct rtnexthop *rtnh;
+		struct nlattr *mp;
+		struct mpls_nh *nh;
+
+		mp = nla_nest_start(skb, RTA_MULTIPATH);
+		if (!mp)
+			goto nla_put_failure;
+
+		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
+			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+			if (!rtnh)
+				goto nla_put_failure;
+
+			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
+			dev = rtnl_dereference(nh->nh_dev);
+			if (dev)
+				rtnh->rtnh_ifindex = dev->ifindex;
+			if (nh->nh_labels &&
+			    nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
+					   nh->nh_label))
+				goto nla_put_failure;
+			if (nla_put_via(skb, nh->nh_via_table,
+					nh->nh_via,
+					nh->nh_via_alen))
+				goto nla_put_failure;
+
+			/* length of rtnetlink header + attributes */
+			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+		}
+
+		nla_nest_end(skb, mp);
+	}
 
 	nlmsg_end(skb, nlh);
 	return 0;
@@ -1000,12 +1284,34 @@  static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
 {
 	size_t payload =
 		NLMSG_ALIGN(sizeof(struct rtmsg))
-		+ nla_total_size(2 + rt->rt_via_alen)	/* RTA_VIA */
 		+ nla_total_size(4);			/* RTA_DST */
-	if (rt->rt_labels)				/* RTA_NEWDST */
-		payload += nla_total_size(rt->rt_labels * 4);
-	if (rt->rt_dev)					/* RTA_OIF */
-		payload += nla_total_size(4);
+
+	if (rt->rt_nhn == 1) {
+		struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs,
+							      struct mpls_nh,
+							      nh_next);
+
+		if (nh->nh_dev)
+			payload += nla_total_size(4); /* RTA_OIF */
+		payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */
+		if (nh->nh_labels) /* RTA_NEWDST */
+			payload += nla_total_size(nh->nh_labels * 4);
+	} else {
+		struct mpls_nh *nh;
+		/* each nexthop is packed in an attribute */
+		size_t nhsize = 0;
+
+		list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
+			nhsize += nla_total_size(sizeof(struct rtnexthop)) +
+					nla_total_size(nh->nh_via_alen +
+						       2); /* RTA_VIA */
+			if (nh->nh_labels) /* RTA_NEWDST */
+				nhsize += nla_total_size(nh->nh_labels * 4);
+		}
+		/* nested attribute */
+		payload += nla_total_size(nhsize);
+	}
+
 	return payload;
 }
 
@@ -1057,25 +1363,37 @@  static int resize_platform_label_table(struct net *net, size_t limit)
 	/* In case the predefined labels need to be populated */
 	if (limit > MPLS_LABEL_IPV4NULL) {
 		struct net_device *lo = net->loopback_dev;
-		rt0 = mpls_rt_alloc(lo->addr_len);
+		struct mpls_nh *nh;
+
+		rt0 = mpls_rt_alloc(1);
 		if (!rt0)
 			goto nort0;
-		RCU_INIT_POINTER(rt0->rt_dev, lo);
 		rt0->rt_protocol = RTPROT_KERNEL;
 		rt0->rt_payload_type = MPT_IPV4;
-		rt0->rt_via_table = NEIGH_LINK_TABLE;
-		memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
+		nh = mpls_nh_alloc(lo->addr_len);
+		if (!nh)
+			goto nort2;
+		RCU_INIT_POINTER(nh->nh_dev, lo);
+		nh->nh_via_table = NEIGH_LINK_TABLE;
+		memcpy(nh->nh_via, lo->dev_addr, lo->addr_len);
+		list_add_tail(&nh->nh_next, &rt0->rt_nhs);
 	}
 	if (limit > MPLS_LABEL_IPV6NULL) {
 		struct net_device *lo = net->loopback_dev;
-		rt2 = mpls_rt_alloc(lo->addr_len);
+		struct mpls_nh *nh;
+
+		rt2 = mpls_rt_alloc(1);
 		if (!rt2)
 			goto nort2;
-		RCU_INIT_POINTER(rt2->rt_dev, lo);
 		rt2->rt_protocol = RTPROT_KERNEL;
 		rt2->rt_payload_type = MPT_IPV6;
-		rt2->rt_via_table = NEIGH_LINK_TABLE;
-		memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
+		nh = mpls_nh_alloc(lo->addr_len);
+		if (!nh)
+			goto nort2;
+		RCU_INIT_POINTER(nh->nh_dev, lo);
+		nh->nh_via_table = NEIGH_LINK_TABLE;
+		memcpy(nh->nh_via, lo->dev_addr, lo->addr_len);
+		list_add_tail(&nh->nh_next, &rt2->rt_nhs);
 	}
 
 	rtnl_lock();
@@ -1085,7 +1403,7 @@  static int resize_platform_label_table(struct net *net, size_t limit)
 
 	/* Free any labels beyond the new table */
 	for (index = limit; index < old_limit; index++)
-		mpls_route_update(net, index, NULL, NULL, NULL);
+		mpls_route_update(net, index, NULL, NULL);
 
 	/* Copy over the old labels */
 	cp_size = size;
@@ -1124,6 +1442,7 @@  static int resize_platform_label_table(struct net *net, size_t limit)
 
 nort2:
 	mpls_rt_free(rt0);
+	mpls_rt_free(rt2);
 nort0:
 	kvfree(labels);
 nolabels:
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 2681a4b..9e18b58 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -1,6 +1,17 @@ 
 #ifndef MPLS_INTERNAL_H
 #define MPLS_INTERNAL_H
 
+enum mpls_payload_type {
+	MPT_UNSPEC, /* IPv4 or IPv6 */
+	MPT_IPV4 = 4,
+	MPT_IPV6 = 6,
+
+	/* Other types not implemented:
+	 *  - Pseudo-wire with or without control word (RFC4385)
+	 *  - GAL (RFC5586)
+	 */
+};
+
 struct mpls_shim_hdr {
 	__be32 label_stack_entry;
 };
@@ -21,6 +32,34 @@  struct mpls_dev {
 
 struct sk_buff;
 
+#define LABEL_NOT_SPECIFIED (1 << 20)
+#define MAX_NEW_LABELS 2
+
+/* This maximum ha length copied from the definition of struct neighbour */
+#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
+
+struct mpls_nh {
+	struct net_device __rcu *nh_dev;
+	u32			nh_label[MAX_NEW_LABELS];
+	unsigned int		nh_flags;
+	int                     nh_weight;
+	int                     nh_power;
+	struct list_head	nh_next;
+	u8			nh_labels;
+	u8			nh_via_alen;
+	u8			nh_via_table;
+	u8			nh_via[0];
+};
+
+struct mpls_route {
+	struct rcu_head		rt_rcu;
+	u8			rt_protocol;
+	u8			rt_payload_type;
+	int                     rt_power;
+	int			rt_nhn;
+	struct list_head        rt_nhs;
+};
+
 static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
 {
 	return (struct mpls_shim_hdr *)skb_network_header(skb);
@@ -52,8 +91,10 @@  static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
 
 int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
 		   const u32 label[]);
-int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels,
+int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels,
 		   u32 label[]);
+int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
+		u8 via[]);
 bool mpls_output_possible(const struct net_device *dev);
 unsigned int mpls_dev_mtu(const struct net_device *dev);
 bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);