diff mbox

bonding: L2L3 xmit doesn't support IPv6

Message ID 20111012025137.GB20605@gospo.rdu.redhat.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Andy Gospodarek Oct. 12, 2011, 2:51 a.m. UTC
On Tue, Oct 11, 2011 at 08:58:59AM -0700, Jay Vosburgh wrote:
> Andy Gospodarek <andy@greyhouse.net> wrote:
[...]
> >
> >There have been some attempts to add support for ipv6 hashing this in
> >the past, but none have been committed.  The best one I had seen was one
> >that did some extensive testing one a wide variety of ipv6 traffic and
> >it showed nice traffic distribution.  I'm not sure if it was ever posted
> >upstream, so I will see if I can dig it up.
> >
> >Can you quantify how traffic was distributed with this algorithm?
> 
> 	As I recall, the IPv6 issues had to do with the "layer3+4" hash,
> because the IPv6 TCP or UDP port numbers can be harder to get at than in
> IPv4 (which typically has a fixed size header).  The above is just for
> layer 2, so it only hits the IPv6 addresses, which don't move around.
> 
> 	That said, I believe that many IPv6 addresses are derived from
> the MAC address, the autoconf addresses in particular, so s6_addr32[3]
> may not show a lot more variation than just the MAC address.  I don't
> know for sure though, since I haven't tested it.
> 
> 	I don't recall seeing the patch you mention, Andy, that checks
> ipv6 traffic; can you post it?
> 

I found the patch, cleaned it up, and compile tested it against
net-next.  I traded some emails with John Eaglesham (cc'd) earlier this
year and though he planned to post it, I never followed up.

His comments about this patch were as follows:

"I've attached my patch for IPv6 transmit hashing for the nic bonding
driver.

"The algorithm I chose is based on 273,913 IPv6 client addresses I
gathered from webservers and ran through a test program that implemented
several algorithms. This algorithm provided the most even distribution
while using the fewest instructions.

"I've tested this on 2.6.39-rc4 and a similar patch to 2.6.18 (from
RHEL5 5.4.3) and it has performed as expected in both cases.

"Please let me know if you have any comments, otherwise I suppose the
next step is to propose the patch to LKML."

I would suggest we use this.  John or I could write an official
changelog and post this in it's own thread if it looks good to others.

---
 drivers/net/bonding/bond_main.c |   30 +++++++++++++++++++++++++-----
 1 files changed, 25 insertions(+), 5 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Yinglin Sun Oct. 12, 2011, 3:39 a.m. UTC | #1
On Tue, Oct 11, 2011 at 7:51 PM, Andy Gospodarek <andy@greyhouse.net> wrote:
> On Tue, Oct 11, 2011 at 08:58:59AM -0700, Jay Vosburgh wrote:
>> Andy Gospodarek <andy@greyhouse.net> wrote:
> [...]
>> >
>> >There have been some attempts to add support for ipv6 hashing this in
>> >the past, but none have been committed.  The best one I had seen was one
>> >that did some extensive testing one a wide variety of ipv6 traffic and
>> >it showed nice traffic distribution.  I'm not sure if it was ever posted
>> >upstream, so I will see if I can dig it up.
>> >
>> >Can you quantify how traffic was distributed with this algorithm?
>>
>>       As I recall, the IPv6 issues had to do with the "layer3+4" hash,
>> because the IPv6 TCP or UDP port numbers can be harder to get at than in
>> IPv4 (which typically has a fixed size header).  The above is just for
>> layer 2, so it only hits the IPv6 addresses, which don't move around.
>>
>>       That said, I believe that many IPv6 addresses are derived from
>> the MAC address, the autoconf addresses in particular, so s6_addr32[3]
>> may not show a lot more variation than just the MAC address.  I don't
>> know for sure though, since I haven't tested it.
>>
>>       I don't recall seeing the patch you mention, Andy, that checks
>> ipv6 traffic; can you post it?
>>
>
> I found the patch, cleaned it up, and compile tested it against
> net-next.  I traded some emails with John Eaglesham (cc'd) earlier this
> year and though he planned to post it, I never followed up.
>
> His comments about this patch were as follows:
>
> "I've attached my patch for IPv6 transmit hashing for the nic bonding
> driver.
>
> "The algorithm I chose is based on 273,913 IPv6 client addresses I
> gathered from webservers and ran through a test program that implemented
> several algorithms. This algorithm provided the most even distribution
> while using the fewest instructions.
>
> "I've tested this on 2.6.39-rc4 and a similar patch to 2.6.18 (from
> RHEL5 5.4.3) and it has performed as expected in both cases.
>
> "Please let me know if you have any comments, otherwise I suppose the
> next step is to propose the patch to LKML."
>
> I would suggest we use this.  John or I could write an official
> changelog and post this in it's own thread if it looks good to others.
>
> ---
>  drivers/net/bonding/bond_main.c |   30 +++++++++++++++++++++++++-----
>  1 files changed, 25 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 6191e63..335cb67 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -3368,11 +3368,20 @@ static struct notifier_block bond_inetaddr_notifier = {
>  static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
>  {
>        struct ethhdr *data = (struct ethhdr *)skb->data;
> -       struct iphdr *iph = ip_hdr(skb);
>
>        if (skb->protocol == htons(ETH_P_IP)) {
> +               struct iphdr *iph = ip_hdr(skb);
>                return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
>                        (data->h_dest[5] ^ data->h_source[5])) % count;
> +       } else if (skb->protocol == htons(ETH_P_IPV6)) {
> +               struct ipv6hdr *ipv6h = ipv6_hdr(skb);
> +               u32 v6hash = (
> +                       (ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
> +                       (ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
> +                       (ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3])
> +               );
> +               v6hash = (v6hash >> 16) ^ (v6hash >> 8) ^ v6hash;
> +               return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count;
>        }
>
>        return (data->h_dest[5] ^ data->h_source[5]) % count;
> @@ -3386,11 +3395,11 @@ static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
>  static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
>  {
>        struct ethhdr *data = (struct ethhdr *)skb->data;
> -       struct iphdr *iph = ip_hdr(skb);
> -       __be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
> -       int layer4_xor = 0;
> +       u32 layer4_xor = 0;
>
>        if (skb->protocol == htons(ETH_P_IP)) {
> +               struct iphdr *iph = ip_hdr(skb);
> +               __be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
>                if (!ip_is_fragment(iph) &&
>                    (iph->protocol == IPPROTO_TCP ||
>                     iph->protocol == IPPROTO_UDP)) {
> @@ -3398,7 +3407,18 @@ static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
>                }
>                return (layer4_xor ^
>                        ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
> -
> +       } else if (skb->protocol == htons(ETH_P_IPV6)) {
> +               struct ipv6hdr *ipv6h = ipv6_hdr(skb);
> +               __be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(*ipv6h));
> +               if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {

Does this work if this is a fragmentation packet? and if
ipv6h->nexthdr is not IPPROTO_TCP/IPPROTO_UDP, it doesn't mean this is
not TCP/UDP packet. We need to go through the extension header chain
and look at the last one. It's likely there are some other extension
headers before L4 header.

Yinglin

> +                       layer4_xor = (*layer4hdrv6 ^ *(layer4hdrv6 + 1));
> +               }
> +               layer4_xor ^= (
> +                       (ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
> +                       (ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
> +                       (ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3])
> +               );
> +               return ((layer4_xor >> 16) ^ (layer4_xor >> 8) ^ layer4_xor) % count;
>        }
>
>        return (data->h_dest[5] ^ data->h_source[5]) % count;
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 12, 2011, 4:06 a.m. UTC | #2
Le mardi 11 octobre 2011 à 20:39 -0700, Yinglin Sun a écrit :
> On Tue, Oct 11, 2011 at 7:51 PM, Andy Gospodarek <andy@greyhouse.net> wrote:
> >
> >        if (skb->protocol == htons(ETH_P_IP)) {
> > +               struct iphdr *iph = ip_hdr(skb);
> > +               __be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
> >                if (!ip_is_fragment(iph) &&
> >                    (iph->protocol == IPPROTO_TCP ||
> >                     iph->protocol == IPPROTO_UDP)) {
> > @@ -3398,7 +3407,18 @@ static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
> >                }
> >                return (layer4_xor ^
> >                        ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
> > -
> > +       } else if (skb->protocol == htons(ETH_P_IPV6)) {
> > +               struct ipv6hdr *ipv6h = ipv6_hdr(skb);
> > +               __be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(*ipv6h));
> > +               if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {
> 
> Does this work if this is a fragmentation packet? and if
> ipv6h->nexthdr is not IPPROTO_TCP/IPPROTO_UDP, it doesn't mean this is
> not TCP/UDP packet. We need to go through the extension header chain
> and look at the last one. It's likely there are some other extension
> headers before L4 header.
> 

Its a best effort.

If first header is not IPPROTO_TCP/UDP, I am not sure its wise to spend
time in hope to find layer4 info (missing anyway in fragments)

By the way I see no bound checking on SKB head : malicious packet could
make bond_xmit_hash_policy_l34() access unitialized memory.

We have same 'fastpath' in __skb_get_rxhash()


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John Eaglesham Oct. 12, 2011, 9:15 p.m. UTC | #3
Eric Dumazet wrote:
> Le mardi 11 octobre 2011 à 20:39 -0700, Yinglin Sun a écrit :
>> On Tue, Oct 11, 2011 at 7:51 PM, Andy Gospodarek <andy@greyhouse.net> wrote:
>>>        if (skb->protocol == htons(ETH_P_IP)) {
>>> +               struct iphdr *iph = ip_hdr(skb);
>>> +               __be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
>>>                if (!ip_is_fragment(iph) &&
>>>                    (iph->protocol == IPPROTO_TCP ||
>>>                     iph->protocol == IPPROTO_UDP)) {
>>> @@ -3398,7 +3407,18 @@ static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
>>>                }
>>>                return (layer4_xor ^
>>>                        ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
>>> -
>>> +       } else if (skb->protocol == htons(ETH_P_IPV6)) {
>>> +               struct ipv6hdr *ipv6h = ipv6_hdr(skb);
>>> +               __be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(*ipv6h));
>>> +               if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {
>> Does this work if this is a fragmentation packet? and if
>> ipv6h->nexthdr is not IPPROTO_TCP/IPPROTO_UDP, it doesn't mean this is
>> not TCP/UDP packet. We need to go through the extension header chain
>> and look at the last one. It's likely there are some other extension
>> headers before L4 header.
>>
> 
> Its a best effort.
> 
> If first header is not IPPROTO_TCP/UDP, I am not sure its wise to spend
> time in hope to find layer4 info (missing anyway in fragments)
> 
> By the way I see no bound checking on SKB head : malicious packet could
> make bond_xmit_hash_policy_l34() access unitialized memory.
> 
> We have same 'fastpath' in __skb_get_rxhash()
> 
> 

Thanks for keeping me in the loop on this.

I caught the bounds checking bug and added checks to code I haven't 
submitted in yet (I still need to test that it works before I submit a 
patch).

I don't like the idea of sticking a loop in this part of the code, 
especially one traversing a list of length controlled by outside users 
with no guarantee it is properly formed. I ran some tests gathering 
traffic and I never saw any packets with headers between the IPv6 and 
TCP or UDP, so at least in the real world right now this code should 
behave as expected for the vast majority of traffic. Things might change 
in the future, but if this code is clear/clean/simple and solves the 
problem today, I'd be happier with that than trying to parse a full IPv6 
header.

I'll test and post my revised patch (including bounds checking and 
documentation updates) in a day or so.

John
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 6191e63..335cb67 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3368,11 +3368,20 @@  static struct notifier_block bond_inetaddr_notifier = {
 static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
 {
 	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
 
 	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = ip_hdr(skb);
 		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
 			(data->h_dest[5] ^ data->h_source[5])) % count;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+		u32 v6hash = (
+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3])
+		);
+		v6hash = (v6hash >> 16) ^ (v6hash >> 8) ^ v6hash;
+		return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count;
 	}
 
 	return (data->h_dest[5] ^ data->h_source[5]) % count;
@@ -3386,11 +3395,11 @@  static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
 static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
 {
 	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
-	__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
-	int layer4_xor = 0;
+	u32 layer4_xor = 0;
 
 	if (skb->protocol == htons(ETH_P_IP)) {
+		struct iphdr *iph = ip_hdr(skb);
+		__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
 		if (!ip_is_fragment(iph) &&
 		    (iph->protocol == IPPROTO_TCP ||
 		     iph->protocol == IPPROTO_UDP)) {
@@ -3398,7 +3407,18 @@  static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
 		}
 		return (layer4_xor ^
 			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
-
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+		__be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(*ipv6h));
+		if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {
+			layer4_xor = (*layer4hdrv6 ^ *(layer4hdrv6 + 1));
+		}
+		layer4_xor ^= (
+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3])
+		);
+		return ((layer4_xor >> 16) ^ (layer4_xor >> 8) ^ layer4_xor) % count;
 	}
 
 	return (data->h_dest[5] ^ data->h_source[5]) % count;