
[net-next] bonding: change xmit hash functions to use skb_flow_dissect

Message ID: 1366412545-10829-1-git-send-email-nikolay@redhat.com
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Nikolay Aleksandrov April 19, 2013, 11:02 p.m. UTC
As Eric suggested earlier, the bonding hash functions can make good use of
skb_flow_dissect. The old use cases should produce the same results, but
there should be a good improvement for tunnel users, mostly over IPv4.
I've kept the IPv6 address hashing algorithm, so if a tunnel is used over
IPv6 the addresses will hash the same, but there can still be an
improvement because the ports from skb_flow_dissect will be mixed in.
This also fixes a problem with protocol == ETH_P_8021Q load balancing.
In case of a non-dissectable packet, the algorithms fall back to L2
hashing.

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
---
 drivers/net/bonding/bond_main.c | 114 ++++++++++++++++++----------------------
 1 file changed, 50 insertions(+), 64 deletions(-)

Comments

Eric Dumazet April 20, 2013, 12:57 a.m. UTC | #1
On Sat, 2013-04-20 at 01:02 +0200, Nikolay Aleksandrov wrote:
> As Eric suggested earlier, bonding hash functions can make good use of
> skb_flow_dissect. The old use cases should have the same results, but
> there should be good improvement for tunnel users mostly over IPv4.
> I've kept the IPv6 address hashing algorithm and thus if a tunnel is
> used over IPv6 then the addresses will be the same but there still can be
> improvement because the ports from skb_flow_dissect will be mixed in.
> This also fixes a problem with protocol == ETH_P_8021Q load balancing.

Are you sure? We don't look at skb->vlan_tci.

> In case of non-dissectable packet, the algorithms fall back to L2
> hashing.
> 
> Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
> ---
>  drivers/net/bonding/bond_main.c | 114 ++++++++++++++++++----------------------
>  1 file changed, 50 insertions(+), 64 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 5e22126..722d8c1 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -77,6 +77,7 @@
>  #include <net/net_namespace.h>
>  #include <net/netns/generic.h>
>  #include <net/pkt_sched.h>
> +#include <net/flow_keys.h>
>  #include "bonding.h"
>  #include "bond_3ad.h"
>  #include "bond_alb.h"
> @@ -3271,94 +3272,79 @@ static struct notifier_block bond_netdev_notifier = {
>  

> +
> +	layer4_xor = ntohs(flow.port16[0] ^ flow.port16[1]);
> +
> +	if (skb->protocol == htons(ETH_P_IPV6))
> +		return (layer4_xor ^ bond_ipv6_hash(skb)) % count;
> +	else
> +		return (layer4_xor ^ bond_ipv4_hash(&flow)) % count;
>  }
>  

Not sure it's worth doing this test, as the IPv6 addresses are mixed already.

So just

hash = (__force u32)flow.ports ^
       (__force u32)keys.dst ^
       (__force u32)keys.src;

hash ^= (hash >> 16);
hash ^= (hash >> 8);

return hash % count;

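In full, with the struct flow_keys fields of this era used consistently
(src, dst and the ports union), the l34 policy would then collapse to
roughly this sketch (reusing the l2 fallback from the patch below):

static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
{
	struct flow_keys flow;
	u32 hash;

	/* non-dissectable packets still fall back to the L2 hash */
	if (!skb_flow_dissect(skb, &flow))
		return bond_xmit_hash_policy_l2(skb, count);

	hash = (__force u32)flow.ports ^
	       (__force u32)flow.dst ^
	       (__force u32)flow.src;

	/* fold the upper bits down so that hash % count is well spread */
	hash ^= (hash >> 16);
	hash ^= (hash >> 8);

	return hash % count;
}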

Nikolay Aleksandrov April 20, 2013, 6:12 a.m. UTC | #2
On 20/04/13 02:57, Eric Dumazet wrote:
> On Sat, 2013-04-20 at 01:02 +0200, Nikolay Aleksandrov wrote:
>> As Eric suggested earlier, bonding hash functions can make good use of
>> skb_flow_dissect. The old use cases should have the same results, but
>> there should be good improvement for tunnel users mostly over IPv4.
>> I've kept the IPv6 address hashing algorithm and thus if a tunnel is
>> used over IPv6 then the addresses will be the same but there still can be
>> improvement because the ports from skb_flow_dissect will be mixed in.
>> This also fixes a problem with protocol == ETH_P_8021Q load balancing.
> 
> Are you sure ? we don't look at skb->vlan_tci
We don't need to look at vlan_tci; the problem was that for a packet with
skb->protocol == htons(ETH_P_8021Q) we previously wouldn't use the network
headers inside and would always fall back to the L2 hash, which in most
cases is weaker.
When using skb_flow_dissect we avoid that because of its header extraction.
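
For reference, the dissector itself peels the VLAN header and restarts on
the encapsulated protocol, roughly along these lines (a paraphrase of the
idea, not the verbatim net/core/flow_dissector.c code):

	/* inside the dissector's protocol switch */
	case __constant_htons(ETH_P_8021Q): {
		const struct vlan_hdr *vlan;
		struct vlan_hdr _vlan;

		vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
		if (!vlan)
			return false;

		/* continue dissecting the encapsulated protocol */
		proto = vlan->h_vlan_encapsulated_proto;
		nhoff += sizeof(*vlan);
		goto again;
	}
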
> 
>> In case of non-dissectable packet, the algorithms fall back to L2
>> hashing.
>>
>> Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
>> ---
>>  drivers/net/bonding/bond_main.c | 114 ++++++++++++++++++----------------------
>>  1 file changed, 50 insertions(+), 64 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>> index 5e22126..722d8c1 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -77,6 +77,7 @@
>>  #include <net/net_namespace.h>
>>  #include <net/netns/generic.h>
>>  #include <net/pkt_sched.h>
>> +#include <net/flow_keys.h>
>>  #include "bonding.h"
>>  #include "bond_3ad.h"
>>  #include "bond_alb.h"
>> @@ -3271,94 +3272,79 @@ static struct notifier_block bond_netdev_notifier = {
>>  
> 
>> +
>> +	layer4_xor = ntohs(flow.port16[0] ^ flow.port16[1]);
>> +
>> +	if (skb->protocol == htons(ETH_P_IPV6))
>> +		return (layer4_xor ^ bond_ipv6_hash(skb)) % count;
>> +	else
>> +		return (layer4_xor ^ bond_ipv4_hash(&flow)) % count;
>>  }
>>  
> 
> Not sure its worth doing this test, as IPv6 addresses are mixed already.
> 
> So just
> 
> hash = (__force u32)flow.ports ^
>        (__force u32)keys.dst ^
>        (__force u32)keys.src;
> 
> hash ^= (hash >> 16);
> hash ^= (hash >> 8);
> 
> return hash % count;
> 
Hm, I actually wanted to keep the old hash because it mixes
different parts of the address, so as to produce the same results as the
original version.
I think ipv6_addr_hash mixes the whole IPv6 address, whereas the old
bond version mixes only bits 32-128.
I'd be happy to update it to this version if the bonding maintainers
agree to it; then we'll be able to remove some ugliness :-)
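
For comparison, the generic variant of ipv6_addr_hash() in <net/ipv6.h>
xors all four 32-bit words of the address (a 64-bit fast path also
exists), while bond_ipv6_hash() in the patch below only mixes
s6_addr32[1..3] of each address:

	static inline u32 ipv6_addr_hash(const struct in6_addr *a)
	{
		return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^
				     a->s6_addr32[2] ^ a->s6_addr32[3]);
	}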

Thanks for the review,
 Nik
Eric Dumazet April 20, 2013, 3:52 p.m. UTC | #3
On Sat, 2013-04-20 at 08:12 +0200, Nikolay Aleksandrov wrote:
> On 20/04/13 02:57, Eric Dumazet wrote:
> > On Sat, 2013-04-20 at 01:02 +0200, Nikolay Aleksandrov wrote:
> >> As Eric suggested earlier, bonding hash functions can make good use of
> >> skb_flow_dissect. The old use cases should have the same results, but
> >> there should be good improvement for tunnel users mostly over IPv4.
> >> I've kept the IPv6 address hashing algorithm and thus if a tunnel is
> >> used over IPv6 then the addresses will be the same but there still can be
> >> improvement because the ports from skb_flow_dissect will be mixed in.
> >> This also fixes a problem with protocol == ETH_P_8021Q load balancing.
> > 
> > Are you sure ? we don't look at skb->vlan_tci
> We don't need to look at vlan_tci, the problem was that when we had a
> packet with skb->protocol == htons(ETH_P_8021Q) before we wouldn't use
> the network headers inside and always fall back to L2 hash which in most
> cases is weaker.
> When using skb_flow_dissect we avoid that because of the header extraction

What I meant to say is: most devices used in a bonding setup use
hardware-assisted VLAN tagging, so I doubt it was a real concern.

The real gain is tunneling decapsulation for free.
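
To illustrate the point (a hypothetical debug snippet; the field names are
the ones from struct flow_keys):

	struct flow_keys flow;

	/* for an encapsulated packet the dissector walks into the inner
	 * headers, so src/dst/ports describe the inner flow and the bond
	 * hash spreads individual tunneled flows instead of collapsing
	 * them all onto the tunnel endpoints' addresses */
	if (skb_flow_dissect(skb, &flow))
		pr_debug("flow %pI4 -> %pI4 ports %08x\n",
			 &flow.src, &flow.dst, ntohl(flow.ports));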

> > Not sure its worth doing this test, as IPv6 addresses are mixed already.
> > 
> > So just
> > 
> > hash = (__force u32)flow.ports ^
> >        (__force u32)keys.dst ^
> >        (__force u32)keys.src;
> > 
> > hash ^= (hash >> 16);
> > hash ^= (hash >> 8);
> > 
> > return hash % count;
> > 
> Hm, I actually wanted to keep the old hash because it mixes
> different parts of the address so to produce the same results as the
> original version.
> I think that ipv6_addr_hash mixes the whole IPv6 address, as the old
> bond version mixes only bits 32-128.
> I'd be happy to update it to this version if the bonding maintainers
> agree to it, then we'll be able to remove some ugliness :-)
> 

My original idea was to re-use skb_flow_dissect() to keep the bonding
hash implementation simpler, yet using all the bits.

The only difference between l23 and l34 is that l23 won't use flow.ports,
but the hash will be the same:

hash = (__force u32)keys.dst ^ (__force u32)keys.src;
hash ^= (hash >> 16);
hash ^= (hash >> 8);
return hash % count;
 

Using ntohl() is really not needed here: just xor32 all the different
parts (or xor64 as in the IPv6 hash function); the final xor16/xor8 then
has the property that hash % count is well spread.
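
A tiny worked example of that property: with count == 2 only bit 0 of the
hash is used, and the fold brings higher-order differences down into it:

	/* src ^ dst for two flows that differ only above bit 15,
	 * e.g. 10.0.0.1 -> 10.1.0.1 and 10.0.0.1 -> 10.2.0.1 */
	u32 a = 0x00010000;
	u32 b = 0x00020000;

	/* without the fold both would land on slave 0 when count == 2 */
	a ^= (a >> 16); a ^= (a >> 8);	/* a == 0x00010101, a % 2 == 1 */
	b ^= (b >> 16); b ^= (b >> 8);	/* b == 0x00020202, b % 2 == 0 */
	/* after the fold the two flows pick different slaves */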

Sure, the hash is not the original one, but it's not in an RFC, is it?

Also keep in mind to update the documentation.

BTW, I was waiting for this patch to be pulled into net-next first before
doing this myself:
 
commit 4394542ca4ec9f28c3c8405063d200b1e7c347d7
bonding: fix l23 and l34 load balancing in forwarding path

Your patch, based on current net-next, makes things more complex for
David. I do not feel this is urgent stuff, is it?



Eric Dumazet April 20, 2013, 4:38 p.m. UTC | #4
On Sat, 2013-04-20 at 08:52 -0700, Eric Dumazet wrote:

> BTW, I was waiting this patch was first pulled on net-next to do this
> myself.
>  

BTW, I was planning to _add_ a new mode, or only change l34 hashing.

Some users of l23 might not want the hash to look past tunnel
headers...


Nikolay Aleksandrov April 20, 2013, 4:46 p.m. UTC | #5
On 20/04/13 17:52, Eric Dumazet wrote:
> On Sat, 2013-04-20 at 08:12 +0200, Nikolay Aleksandrov wrote:
>> On 20/04/13 02:57, Eric Dumazet wrote:
>>> On Sat, 2013-04-20 at 01:02 +0200, Nikolay Aleksandrov wrote:
>>>> As Eric suggested earlier, bonding hash functions can make good use of
>>>> skb_flow_dissect. The old use cases should have the same results, but
>>>> there should be good improvement for tunnel users mostly over IPv4.
>>>> I've kept the IPv6 address hashing algorithm and thus if a tunnel is
>>>> used over IPv6 then the addresses will be the same but there still can be
>>>> improvement because the ports from skb_flow_dissect will be mixed in.
>>>> This also fixes a problem with protocol == ETH_P_8021Q load balancing.
>>>
>>> Are you sure ? we don't look at skb->vlan_tci
>> We don't need to look at vlan_tci, the problem was that when we had a
>> packet with skb->protocol == htons(ETH_P_8021Q) before we wouldn't use
>> the network headers inside and always fall back to L2 hash which in most
>> cases is weaker.
>> When using skb_flow_dissect we avoid that because of the header extraction
> 
> What I meant to say is : Most devices used in a bonding setup use
> hardware assist vlan tagging, so I doubt it was a real concern.
> 
> The real gain is tunneling decapsulation for free.
> 
Yes, I completely agree, but believe it or not there are arcane cases
that disable HW VLAN acceleration, and I've seen custom patches to deal
with them. In fact, this patch started as an addition of ETH_P_8021Q
handling only, but then I saw your suggestion and took a look at
skb_flow_dissect, which apparently is perfect for this kind of use.
>>> Not sure its worth doing this test, as IPv6 addresses are mixed already.
>>>
>>> So just
>>>
>>> hash = (__force u32)flow.ports ^
>>>        (__force u32)keys.dst ^
>>>        (__force u32)keys.src;
>>>
>>> hash ^= (hash >> 16);
>>> hash ^= (hash >> 8);
>>>
>>> return hash % count;
>>>
>> Hm, I actually wanted to keep the old hash because it mixes
>> different parts of the address so to produce the same results as the
>> original version.
>> I think that ipv6_addr_hash mixes the whole IPv6 address, as the old
>> bond version mixes only bits 32-128.
>> I'd be happy to update it to this version if the bonding maintainers
>> agree to it, then we'll be able to remove some ugliness :-)
>>
> 
> My original idea was to re-use skb_flow_dissect() to keep a more simple
> bonding hash implementation, yet using all bits.
> 
> The only difference between l23 and l34 is that l23 wont use flow.ports,
> but the hash will be the same :
> 
> hash = (__force u32)keys.dst ^ (__force u32)keys.src;
> hash ^= (hash >> 16);
> hash ^= (hash >> 8);
> return hash % count;
>
Yeah, I understand. I haven't questioned the new hash function, and I
already have a similar implementation with a bond_flow_hash() inline
helper that uses only one hash for both. I will use your suggested hash
function that mixes all the bits and will update the patch.
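
Something along these lines, presumably (a hypothetical sketch; the real
helper may of course differ):

	/* one hash for both policies; l34 additionally mixes in the ports */
	static inline u32 bond_flow_hash(struct flow_keys *flow, bool use_l4)
	{
		u32 hash = (__force u32)flow->dst ^ (__force u32)flow->src;

		if (use_l4)
			hash ^= (__force u32)flow->ports;

		hash ^= (hash >> 16);
		hash ^= (hash >> 8);

		return hash;
	}
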
> 
> using ntohl() is really not needed here, just xor32 all the different
> parts (or xor64 as in the IPv6 hash function), then the final xor16/xor8
> has the property that hash % count is well spread.
> 
> Sure, the hash is not the original one, but its not in a RFC, is it ?
> 
Of course it's not :-) I'd have used a custom hash function that uses
only the flow keys too, but I wanted to make it less invasive at first;
that's why I kept the old functions.
> Also keep in mind to change documentation.
> 
> BTW, I was waiting this patch was first pulled on net-next to do this
> myself.
>  
If you'd like to submit it, I can drop this one.
> commit 4394542ca4ec9f28c3c8405063d200b1e7c347d7
> bonding: fix l23 and l34 load balancing in forwarding path
> 
> Your patch based on current net-next makes things more complex for
> David. I do not feel this is urgent stuff, is it ?
> 
Nope, it can wait. I was more interested in the review; I will make the
necessary changes (hash and docs) and queue this at a later time.


Nik




Patch

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5e22126..722d8c1 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -77,6 +77,7 @@ 
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/pkt_sched.h>
+#include <net/flow_keys.h>
 #include "bonding.h"
 #include "bond_3ad.h"
 #include "bond_alb.h"
@@ -3271,94 +3272,79 @@  static struct notifier_block bond_netdev_notifier = {
 
 /*---------------------------- Hashing Policies -----------------------------*/
 
-/*
- * Hash for the output device based upon layer 2 data
- */
-static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
+/* Ethernet/IPv4/IPv6 hash helpers */
+static inline u32 bond_eth_hash(struct sk_buff *skb)
 {
 	struct ethhdr *data = (struct ethhdr *)skb->data;
 
-	if (skb_headlen(skb) >= offsetof(struct ethhdr, h_proto))
-		return (data->h_dest[5] ^ data->h_source[5]) % count;
+	return data->h_dest[5] ^ data->h_source[5];
+}
 
-	return 0;
+static inline u32 bond_ipv4_hash(struct flow_keys *flow)
+{
+	return ntohl(flow->src ^ flow->dst) & 0xffff;
 }
 
-/*
- * Hash for the output device based upon layer 2 and layer 3 data. If
- * the packet is not IP, fall back on bond_xmit_hash_policy_l2()
- */
-static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
+static inline u32 bond_ipv6_hash(struct sk_buff *skb)
 {
-	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph;
 	struct ipv6hdr *ipv6h;
-	u32 v6hash;
+	u32 v6hash = 0;
 	__be32 *s, *d;
 
-	if (skb->protocol == htons(ETH_P_IP) &&
-	    skb_network_header_len(skb) >= sizeof(*iph)) {
-		iph = ip_hdr(skb);
-		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
-			(data->h_dest[5] ^ data->h_source[5])) % count;
-	} else if (skb->protocol == htons(ETH_P_IPV6) &&
-		   skb_network_header_len(skb) >= sizeof(*ipv6h)) {
+	if (pskb_network_may_pull(skb, sizeof(*ipv6h))) {
 		ipv6h = ipv6_hdr(skb);
 		s = &ipv6h->saddr.s6_addr32[0];
 		d = &ipv6h->daddr.s6_addr32[0];
 		v6hash = (s[1] ^ d[1]) ^ (s[2] ^ d[2]) ^ (s[3] ^ d[3]);
 		v6hash ^= (v6hash >> 24) ^ (v6hash >> 16) ^ (v6hash >> 8);
-		return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count;
 	}
 
-	return bond_xmit_hash_policy_l2(skb, count);
+	return v6hash;
 }
 
-/*
- * Hash for the output device based upon layer 3 and layer 4 data. If
- * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
- * altogether not IP, fall back on bond_xmit_hash_policy_l2()
+/* Hash for the output device based upon layer 2 data */
+static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
+{
+	if (likely(skb_headlen(skb) >= offsetof(struct ethhdr, h_proto)))
+		return bond_eth_hash(skb) % count;
+
+	return 0;
+}
+
+/* Hash for the output device based upon layer 2 and layer 3 data. If
+ * the packet is not dissectable, fall back on bond_xmit_hash_policy_l2()
+ */
+static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
+{
+	struct flow_keys flow;
+
+	if (!skb_flow_dissect(skb, &flow))
+		return bond_xmit_hash_policy_l2(skb, count);
+
+	if (skb->protocol == htons(ETH_P_IPV6))
+		return (bond_ipv6_hash(skb) ^ bond_eth_hash(skb)) % count;
+	else
+		return (bond_ipv4_hash(&flow) ^ bond_eth_hash(skb)) % count;
+}
+
+/* Hash for the output device based upon layer 3 and layer 4 data. If
+ * the packet is not TCP or UDP, just use layer 3 data.  If it is
+ * altogether not dissectable, fall back on bond_xmit_hash_policy_l2()
  */
 static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
 {
+	struct flow_keys flow;
 	u32 layer4_xor = 0;
-	struct iphdr *iph;
-	struct ipv6hdr *ipv6h;
-	__be32 *s, *d;
-	__be16 *layer4hdr;
-
-	if (skb->protocol == htons(ETH_P_IP) &&
-	    skb_network_header_len(skb) >= sizeof(*iph)) {
-		iph = ip_hdr(skb);
-		if (!ip_is_fragment(iph) &&
-		    (iph->protocol == IPPROTO_TCP ||
-		     iph->protocol == IPPROTO_UDP) &&
-		    (skb_headlen(skb) - skb_network_offset(skb) >=
-		     iph->ihl * sizeof(u32) + sizeof(*layer4hdr) * 2)) {
-			layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
-			layer4_xor = ntohs(*layer4hdr ^ *(layer4hdr + 1));
-		}
-		return (layer4_xor ^
-			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
-	} else if (skb->protocol == htons(ETH_P_IPV6) &&
-		   skb_network_header_len(skb) >= sizeof(*ipv6h)) {
-		ipv6h = ipv6_hdr(skb);
-		if ((ipv6h->nexthdr == IPPROTO_TCP ||
-		     ipv6h->nexthdr == IPPROTO_UDP) &&
-		    (skb_headlen(skb) - skb_network_offset(skb) >=
-		     sizeof(*ipv6h) + sizeof(*layer4hdr) * 2)) {
-			layer4hdr = (__be16 *)(ipv6h + 1);
-			layer4_xor = ntohs(*layer4hdr ^ *(layer4hdr + 1));
-		}
-		s = &ipv6h->saddr.s6_addr32[0];
-		d = &ipv6h->daddr.s6_addr32[0];
-		layer4_xor ^= (s[1] ^ d[1]) ^ (s[2] ^ d[2]) ^ (s[3] ^ d[3]);
-		layer4_xor ^= (layer4_xor >> 24) ^ (layer4_xor >> 16) ^
-			       (layer4_xor >> 8);
-		return layer4_xor % count;
-	}
 
-	return bond_xmit_hash_policy_l2(skb, count);
+	if (!skb_flow_dissect(skb, &flow))
+		return bond_xmit_hash_policy_l2(skb, count);
+
+	layer4_xor = ntohs(flow.port16[0] ^ flow.port16[1]);
+
+	if (skb->protocol == htons(ETH_P_IPV6))
+		return (layer4_xor ^ bond_ipv6_hash(skb)) % count;
+	else
+		return (layer4_xor ^ bond_ipv4_hash(&flow)) % count;
 }
 
 /*-------------------------- Device entry points ----------------------------*/