diff mbox series

[net-next,4/4] ipv6: Add support for non-equal-cost multipath

Message ID 20180109144028.30133-5-idosch@mellanox.com
State Accepted, archived
Delegated to: David Miller
Headers show
Series ipv6: Add support for non-equal-cost multipath | expand

Commit Message

Ido Schimmel Jan. 9, 2018, 2:40 p.m. UTC
The use of hash-threshold instead of modulo-N makes it trivial to add
support for non-equal-cost multipath.

Instead of dividing the multipath hash function's output space equally
between the nexthops, each nexthop is assigned a region size which is
proportional to its weight.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 include/net/ip6_fib.h |  1 +
 net/ipv6/route.c      | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

Comments

David Ahern Jan. 10, 2018, 3:48 a.m. UTC | #1
On 1/9/18 7:40 AM, Ido Schimmel wrote:
> The use of hash-threshold instead of modulo-N makes it trivial to add
> support for non-equal-cost multipath.
> 
> Instead of dividing the multipath hash function's output space equally
> between the nexthops, each nexthop is assigned a region size which is
> proportional to its weight.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> ---
>  include/net/ip6_fib.h |  1 +
>  net/ipv6/route.c      | 11 +++++++----
>  2 files changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index 97cd05d87780..34ec321d6a03 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -171,6 +171,7 @@ struct rt6_info {
>  	u32				rt6i_metric;
>  	u32				rt6i_pmtu;
>  	/* more non-fragment space at head required */
> +	int				rt6i_nh_weight;
>  	unsigned short			rt6i_nfheader_len;
>  	u8				rt6i_protocol;
>  	u8				exception_bucket_flushed:1,

Since dst is cacheline aligned there is a hole after rt6i_nh_flags. In
patch 1 you put rt6i_nh_upper_bound in that hole. Putting the weight
there too keeps those variables together as well as using the open space.
Ido Schimmel Jan. 10, 2018, 11:47 a.m. UTC | #2
Hi David,

On Tue, Jan 09, 2018 at 08:48:37PM -0700, David Ahern wrote:
> On 1/9/18 7:40 AM, Ido Schimmel wrote:
> > The use of hash-threshold instead of modulo-N makes it trivial to add
> > support for non-equal-cost multipath.
> > 
> > Instead of dividing the multipath hash function's output space equally
> > between the nexthops, each nexthop is assigned a region size which is
> > proportional to its weight.
> > 
> > Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> > ---
> >  include/net/ip6_fib.h |  1 +
> >  net/ipv6/route.c      | 11 +++++++----
> >  2 files changed, 8 insertions(+), 4 deletions(-)
> > 
> > diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> > index 97cd05d87780..34ec321d6a03 100644
> > --- a/include/net/ip6_fib.h
> > +++ b/include/net/ip6_fib.h
> > @@ -171,6 +171,7 @@ struct rt6_info {
> >  	u32				rt6i_metric;
> >  	u32				rt6i_pmtu;
> >  	/* more non-fragment space at head required */
> > +	int				rt6i_nh_weight;
> >  	unsigned short			rt6i_nfheader_len;
> >  	u8				rt6i_protocol;
> >  	u8				exception_bucket_flushed:1,
> 
> Since dst is cacheline aligned there is a hole after rt6i_nh_flags. In
> patch 1 you put rt6i_nh_upper_bound in that hole. Putting the weight
> there too keeps those variables together as well as using the open space.

Before patch 1 there's a hole of 4 bytes after rt6i_nh_flags which I use
for rt6i_nh_upper_bound. If I put rt6i_nh_weight there as well, then I
create a 60 bytes hole because the dst needs to be cached aligned.

Since rt6i_nh_weight isn't used in fast-path, I just put it at the end.
David Ahern Jan. 10, 2018, 3:53 p.m. UTC | #3
On 1/10/18 4:47 AM, Ido Schimmel wrote:
> Hi David,
> 
> On Tue, Jan 09, 2018 at 08:48:37PM -0700, David Ahern wrote:
>> On 1/9/18 7:40 AM, Ido Schimmel wrote:
>>> The use of hash-threshold instead of modulo-N makes it trivial to add
>>> support for non-equal-cost multipath.
>>>
>>> Instead of dividing the multipath hash function's output space equally
>>> between the nexthops, each nexthop is assigned a region size which is
>>> proportional to its weight.
>>>
>>> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
>>> ---
>>>  include/net/ip6_fib.h |  1 +
>>>  net/ipv6/route.c      | 11 +++++++----
>>>  2 files changed, 8 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
>>> index 97cd05d87780..34ec321d6a03 100644
>>> --- a/include/net/ip6_fib.h
>>> +++ b/include/net/ip6_fib.h
>>> @@ -171,6 +171,7 @@ struct rt6_info {
>>>  	u32				rt6i_metric;
>>>  	u32				rt6i_pmtu;
>>>  	/* more non-fragment space at head required */
>>> +	int				rt6i_nh_weight;
>>>  	unsigned short			rt6i_nfheader_len;
>>>  	u8				rt6i_protocol;
>>>  	u8				exception_bucket_flushed:1,
>>
>> Since dst is cacheline aligned there is a hole after rt6i_nh_flags. In
>> patch 1 you put rt6i_nh_upper_bound in that hole. Putting the weight
>> there too keeps those variables together as well as using the open space.
> 
> Before patch 1 there's a hole of 4 bytes after rt6i_nh_flags which I use
> for rt6i_nh_upper_bound. If I put rt6i_nh_weight there as well, then I
> create a 60 bytes hole because the dst needs to be cached aligned.
> 
> Since rt6i_nh_weight isn't used in fast-path, I just put it at the end.
> 

Apparently, I was on a 4.14 branch when I ran pahole to dump the layout
of rt6_info. The patch looks good to me.

Acked-by: David Ahern <dsahern@gmail.com>
diff mbox series

Patch

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 97cd05d87780..34ec321d6a03 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -171,6 +171,7 @@  struct rt6_info {
 	u32				rt6i_metric;
 	u32				rt6i_pmtu;
 	/* more non-fragment space at head required */
+	int				rt6i_nh_weight;
 	unsigned short			rt6i_nfheader_len;
 	u8				rt6i_protocol;
 	u8				exception_bucket_flushed:1,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7837b8c754a3..1076ae0ea9d5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2594,6 +2594,7 @@  static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 #endif
 
 	rt->rt6i_metric = cfg->fc_metric;
+	rt->rt6i_nh_weight = 1;
 
 	/* We cannot add true routes via loopback here,
 	   they would result in kernel looping; promote them to reject routes
@@ -3507,11 +3508,11 @@  static int rt6_multipath_total_weight(const struct rt6_info *rt)
 	int total = 0;
 
 	if (!rt6_is_dead(rt))
-		total++;
+		total += rt->rt6i_nh_weight;
 
 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
 		if (!rt6_is_dead(iter))
-			total++;
+			total += iter->rt6i_nh_weight;
 	}
 
 	return total;
@@ -3522,7 +3523,7 @@  static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
 	int upper_bound = -1;
 
 	if (!rt6_is_dead(rt)) {
-		(*weight)++;
+		*weight += rt->rt6i_nh_weight;
 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
 						    total) - 1;
 	}
@@ -4024,6 +4025,8 @@  static int ip6_route_multipath_add(struct fib6_config *cfg,
 			goto cleanup;
 		}
 
+		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
+
 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
 		if (err) {
 			dst_release_immediate(&rt->dst);
@@ -4246,7 +4249,7 @@  static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
 	if (!rtnh)
 		goto nla_put_failure;
 
-	rtnh->rtnh_hops = 0;
+	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
 
 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)