diff mbox

[net-next,V3,3/3] net: Add GRO support for vxlan traffic

Message ID 1389213278-2200-4-git-send-email-ogerlitz@mellanox.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Or Gerlitz Jan. 8, 2014, 8:34 p.m. UTC
Add gro handlers for vxlan using the udp gro infrastructure

On my setup, which is net-next (now with the mlx4 vxlan offloads patches),
a single TCP session that goes through vxlan tunneling shows a nice
improvement, from 6.8 Gb/s to 11.5 Gb/s.

--> UDP/VXLAN GRO disabled
$ netperf  -H 192.168.52.147 -c -C

$ netperf -t TCP_STREAM -H 192.168.52.147 -c -C
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.52.147 () port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  65536  65536    10.00      6799.75   12.54    24.79    0.604   1.195

--> UDP/VXLAN GRO enabled

$ netperf -t TCP_STREAM -H 192.168.52.147 -c -C
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.52.147 () port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  65536  65536    10.00      11562.72   24.90    20.34    0.706   0.577

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/vxlan.c |  129 ++++++++++++++++++++++++++++++++++++++++++++++++---
 include/net/vxlan.h |    1 +
 2 files changed, 123 insertions(+), 7 deletions(-)

Comments

Eric Dumazet Jan. 8, 2014, 10:09 p.m. UTC | #1
On Wed, 2014-01-08 at 22:34 +0200, Or Gerlitz wrote:
> +
> +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff)
> +{
> +	struct ethhdr *eh;
> +	struct packet_offload *ptype;
> +	__be16 type;
> +	/* 22 = 8 bytes for the vlxan header + 14 bytes for the inner eth header */
> +	int vxlan_len  = 22;



I am pretty sure this can use existing macros or sizeof(...)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Jan. 8, 2014, 10:11 p.m. UTC | #2
On Wed, 2014-01-08 at 22:34 +0200, Or Gerlitz wrote:

> +
>  /* Notify netdevs that UDP port started listening */
> -static void vxlan_notify_add_rx_port(struct sock *sk)
> +static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
>  {
>  	struct net_device *dev;
> +	struct sock *sk = vs->sock->sk;
>  	struct net *net = sock_net(sk);
>  	sa_family_t sa_family = sk->sk_family;
>  	__be16 port = inet_sk(sk)->inet_sport;
> @@ -569,12 +671,16 @@ static void vxlan_notify_add_rx_port(struct sock *sk)
>  							    port);
>  	}
>  	rcu_read_unlock();
> +
> +	if (sa_family == AF_INET)
> +		call_rcu(&vs->rcu, vxlan_add_udp_offload);

Why waiting RCU grace period here ?

>  }
>  
>  /* Notify netdevs that UDP port is no more listening */
> -static void vxlan_notify_del_rx_port(struct sock *sk)
> +static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
>  {
>  	struct net_device *dev;
> +	struct sock *sk = vs->sock->sk;
>  	struct net *net = sock_net(sk);
>  	sa_family_t sa_family = sk->sk_family;
>  	__be16 port = inet_sk(sk)->inet_sport;
> @@ -586,6 +692,9 @@ static void vxlan_notify_del_rx_port(struct sock *sk)
>  							    port);
>  	}
>  	rcu_read_unlock();
> +
> +	if (sa_family == AF_INET)
> +		call_rcu(&vs->rcu, vxlan_del_udp_offload);
>  }

This looks buggy.

You need to :

1) remove the offload structure from list
2) Then wait rcu grace period, and finally free the memory



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Jan. 9, 2014, 6:28 a.m. UTC | #3
On 09/01/2014 00:09, Eric Dumazet wrote:
> On Wed, 2014-01-08 at 22:34 +0200, Or Gerlitz wrote:
>> +
>> +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff)
>> +{
>> +	struct ethhdr *eh;
>> +	struct packet_offload *ptype;
>> +	__be16 type;
>> +	/* 22 = 8 bytes for the vlxan header + 14 bytes for the inner eth header */
>> +	int vxlan_len  = 22;
>
>
> I am pretty sure this can use existing macros or sizeof(...)

sure, will fix



>


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Or Gerlitz Jan. 9, 2014, 6:32 a.m. UTC | #4
On 09/01/2014 00:11, Eric Dumazet wrote:
> On Wed, 2014-01-08 at 22:34 +0200, Or Gerlitz wrote:
>
>> +
>>   /* Notify netdevs that UDP port started listening */
>> -static void vxlan_notify_add_rx_port(struct sock *sk)
>> +static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
>>   {
>>   	struct net_device *dev;
>> +	struct sock *sk = vs->sock->sk;
>>   	struct net *net = sock_net(sk);
>>   	sa_family_t sa_family = sk->sk_family;
>>   	__be16 port = inet_sk(sk)->inet_sport;
>> @@ -569,12 +671,16 @@ static void vxlan_notify_add_rx_port(struct sock *sk)
>>   							    port);
>>   	}
>>   	rcu_read_unlock();
>> +
>> +	if (sa_family == AF_INET)
>> +		call_rcu(&vs->rcu, vxlan_add_udp_offload);
> Why waiting RCU grace period here?

Basically, the add operation can be done right away. However, since the
delete operation can't be done instantly when we want it, I wanted to
protect against a series of add/del/add operations at times T1 < T2 < T3:

T1 add(X)
T2 del(X)
T3 add(X)

where the delete is deferred and, as a result, the 2nd add is done before
the delete, so in the end offload X is not added the 2nd time. From your
other comment below I conclude that I am probably missing something about
the rcu usage here, so I will give it further thought.



>
>>   }
>>   
>>   /* Notify netdevs that UDP port is no more listening */
>> -static void vxlan_notify_del_rx_port(struct sock *sk)
>> +static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
>>   {
>>   	struct net_device *dev;
>> +	struct sock *sk = vs->sock->sk;
>>   	struct net *net = sock_net(sk);
>>   	sa_family_t sa_family = sk->sk_family;
>>   	__be16 port = inet_sk(sk)->inet_sport;
>> @@ -586,6 +692,9 @@ static void vxlan_notify_del_rx_port(struct sock *sk)
>>   							    port);
>>   	}
>>   	rcu_read_unlock();
>> +
>> +	if (sa_family == AF_INET)
>> +		call_rcu(&vs->rcu, vxlan_del_udp_offload);
>>   }
> This looks buggy.
>
> You need to :
>
> 1) remove the offload structure from list
> 2) Then wait rcu grace period, and finally free the memory
>
>
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 481f85d..e132f19 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -40,6 +40,7 @@ 
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/vxlan.h>
+#include <net/protocol.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
 #include <net/addrconf.h>
@@ -554,10 +555,111 @@  static int vxlan_fdb_append(struct vxlan_fdb *f,
 	return 1;
 }
 
+/* GRO receive handler for vxlan: pulls the vxlan and inner ethernet headers,
+ * clears same_flow on held packets whose VNI or inner ethernet header differ,
+ * then hands the skb to the gro_receive callback of the inner ethertype.
+ */
+static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb)
+{
+	struct sk_buff *p, **pp = NULL;
+	struct vxlanhdr *vh, *vh2;
+	struct ethhdr *eh, *eh2;
+	unsigned int hlen, off_vx, off_eth;
+	const struct packet_offload *ptype;
+	int flush = 1;
+
+	off_vx = skb_gro_offset(skb);
+	hlen = off_vx + sizeof(*vh);
+	vh   = skb_gro_header_fast(skb, off_vx);
+	if (skb_gro_header_hard(skb, hlen)) {
+		vh = skb_gro_header_slow(skb, hlen, off_vx);
+		if (unlikely(!vh))
+			goto out;
+	}
+	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
+
+	off_eth = skb_gro_offset(skb);
+	hlen = off_eth + sizeof(*eh);
+	eh   = skb_gro_header_fast(skb, off_eth);
+	if (skb_gro_header_hard(skb, hlen)) {
+		eh = skb_gro_header_slow(skb, hlen, off_eth);
+		if (unlikely(!eh))
+			goto out;
+	}
+
+	flush = 0;
+
+	/* Check every held packet rather than stopping at the first match:
+	 * a skipped entry keeps same_flow set despite a different VNI.
+	 */
+	for (p = *head; p; p = p->next) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		vh2 = (struct vxlanhdr *)(p->data + off_vx);
+		eh2 = (struct ethhdr   *)(p->data + off_eth);
+		if (vh->vx_vni != vh2->vx_vni || compare_ether_header(eh, eh2))
+			NAPI_GRO_CB(p)->same_flow = 0;
+	}
+
+	rcu_read_lock();
+	ptype = gro_find_receive_by_type(eh->h_proto);
+	if (ptype == NULL) {
+		flush = 1;
+		goto out_unlock;
+	}
+
+	skb_gro_pull(skb, sizeof(*eh)); /* pull inner eth header */
+	pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	NAPI_GRO_CB(skb)->flush |= flush;
+
+	return pp;
+}
+
+/* GRO complete for vxlan: @nhoff is the offset of the vxlan header. Calls the
+ * inner ethertype's gro_complete with the offset past the vxlan and inner
+ * ethernet headers; returns -ENOSYS when no offload handles that ethertype.
+ */
+static int vxlan_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct ethhdr *eh;
+	struct packet_offload *ptype;
+	int vxlan_len = sizeof(struct vxlanhdr) + sizeof(struct ethhdr);
+	int err = -ENOSYS;
+
+	eh = (struct ethhdr *)(skb->data + nhoff + sizeof(struct vxlanhdr));
+
+	rcu_read_lock();
+	ptype = gro_find_complete_by_type(eh->h_proto);
+	if (ptype != NULL)
+		err = ptype->callbacks.gro_complete(skb, nhoff + vxlan_len);
+	rcu_read_unlock();
+	return err;
+}
+
+/* RCU callback: register the udp offload of the vxlan_sock owning @head. */
+static void vxlan_add_udp_offload(struct rcu_head *head)
+{
+	udp_add_offload(&container_of(head, struct vxlan_sock,
+				      rcu)->udp_offloads);
+}
+
+/* RCU callback: unregister the udp offload of the vxlan_sock owning @head. */
+static void vxlan_del_udp_offload(struct rcu_head *head)
+{
+	udp_del_offload(&container_of(head, struct vxlan_sock,
+				      rcu)->udp_offloads);
+}
+
 /* Notify netdevs that UDP port started listening */
-static void vxlan_notify_add_rx_port(struct sock *sk)
+static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
 {
 	struct net_device *dev;
+	struct sock *sk = vs->sock->sk;
 	struct net *net = sock_net(sk);
 	sa_family_t sa_family = sk->sk_family;
 	__be16 port = inet_sk(sk)->inet_sport;
@@ -569,12 +671,16 @@  static void vxlan_notify_add_rx_port(struct sock *sk)
 							    port);
 	}
 	rcu_read_unlock();
+
+	if (sa_family == AF_INET)
+		call_rcu(&vs->rcu, vxlan_add_udp_offload);
 }
 
 /* Notify netdevs that UDP port is no more listening */
-static void vxlan_notify_del_rx_port(struct sock *sk)
+static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
 {
 	struct net_device *dev;
+	struct sock *sk = vs->sock->sk;
 	struct net *net = sock_net(sk);
 	sa_family_t sa_family = sk->sk_family;
 	__be16 port = inet_sk(sk)->inet_sport;
@@ -586,6 +692,9 @@  static void vxlan_notify_del_rx_port(struct sock *sk)
 							    port);
 	}
 	rcu_read_unlock();
+
+	if (sa_family == AF_INET)
+		call_rcu(&vs->rcu, vxlan_del_udp_offload);
 }
 
 /* Add new entry to forwarding table -- assumes lock held */
@@ -964,7 +1073,7 @@  void vxlan_sock_release(struct vxlan_sock *vs)
 	spin_lock(&vn->sock_lock);
 	hlist_del_rcu(&vs->hlist);
 	rcu_assign_sk_user_data(vs->sock->sk, NULL);
-	vxlan_notify_del_rx_port(sk);
+	vxlan_notify_del_rx_port(vs);
 	spin_unlock(&vn->sock_lock);
 
 	queue_work(vxlan_wq, &vs->del_work);
@@ -1125,8 +1234,8 @@  static void vxlan_rcv(struct vxlan_sock *vs,
 	 * leave the CHECKSUM_UNNECESSARY, the device checksummed it
 	 * for us. Otherwise force the upper layers to verify it.
 	 */
-	if (skb->ip_summed != CHECKSUM_UNNECESSARY || !skb->encapsulation ||
-	    !(vxlan->dev->features & NETIF_F_RXCSUM))
+	if ((skb->ip_summed != CHECKSUM_UNNECESSARY && skb->ip_summed != CHECKSUM_PARTIAL) ||
+	    !skb->encapsulation || !(vxlan->dev->features & NETIF_F_RXCSUM))
 		skb->ip_summed = CHECKSUM_NONE;
 
 	skb->encapsulation = 0;
@@ -2304,7 +2413,7 @@  static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
 	struct sock *sk;
 	unsigned int h;
 
-	vs = kmalloc(sizeof(*vs), GFP_KERNEL);
+	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
 	if (!vs)
 		return ERR_PTR(-ENOMEM);
 
@@ -2329,9 +2438,15 @@  static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
 	vs->data = data;
 	rcu_assign_sk_user_data(vs->sock->sk, vs);
 
+	/* Initialize the vxlan udp offloads structure */
+	vs->udp_offloads.port = port;
+	vs->udp_offloads.callbacks.gro_receive  = vxlan_gro_receive;
+	vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;
+	INIT_LIST_HEAD(&vs->udp_offloads.list);
+
 	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
-	vxlan_notify_add_rx_port(sk);
+	vxlan_notify_add_rx_port(vs);
 	spin_unlock(&vn->sock_lock);
 
 	/* Mark socket as an encapsulation socket. */
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6b6d180..5deef1a 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -21,6 +21,7 @@  struct vxlan_sock {
 	struct rcu_head	  rcu;
 	struct hlist_head vni_list[VNI_HASH_SIZE];
 	atomic_t	  refcnt;
+	struct udp_offload udp_offloads;
 };
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,