Patchwork [net-next-2.6] packet: Add GSO/checksum offload support to af_packet sockets

login
register
mail settings
Submitter Sridhar Samudrala
Date Jan. 26, 2010, 8:30 p.m.
Message ID <1264537819.24933.122.camel@w-sridhar.beaverton.ibm.com>
Download mbox | patch
Permalink /patch/43726/
State Changes Requested
Delegated to: David Miller
Headers show

Comments

Sridhar Samudrala - Jan. 26, 2010, 8:30 p.m.
This patch adds GSO/checksum offload to af_packet sockets using
virtio_net_hdr. Based on Rusty's patch to add this support to tun.
It allows GSO/checksum offload to be enabled when using raw socket
backend with virtio_net.
Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the
receive path and process/skip virtio_net_hdr in the send path. This
option is only allowed with SOCK_RAW sockets attached to ethernet
type devices.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell - Jan. 27, 2010, 1:52 a.m.
On Wed, 27 Jan 2010 07:00:19 am Sridhar Samudrala wrote:
> This patch adds GSO/checksum offload to af_packet sockets using
> virtio_net_hdr. Based on Rusty's patch to add this support to tun.
> It allows GSO/checksum offload to be enabled when using raw socket
> backend with virtio_net.
> Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the
> receive path and process/skip virtio_net_hdr in the send path. This
> option is only allowed with SOCK_RAW sockets attached to ethernet
> type devices.

This is really MST's baby, so I'm going to defer to his wisdom on this...

Cheers,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - Jan. 27, 2010, 11:42 a.m.
On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote:
> This patch adds GSO/checksum offload to af_packet sockets using
> virtio_net_hdr. Based on Rusty's patch to add this support to tun.
> It allows GSO/checksum offload to be enabled when using raw socket
> backend with virtio_net.
> Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the
> receive path and process/skip virtio_net_hdr in the send path. This
> option is only allowed with SOCK_RAW sockets attached to ethernet
> type devices.
> 
> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

So the main issue with this implemenation is that it silently fails for
non-ethernet protocols.  It would be better to detect unsupported
protocols and return an error to user.  This is same issue that was
pointed out by DaveM with my earlier attempt to solve a different
(related) problem:
http://lkml.org/lkml/2010/1/5/474
For an incomplete prototype attempting to solve the issue in a generic way:
http://lkml.org/lkml/2010/1/6/56

A couple of additional comments below.

> diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> index 4021d47..aa57a5f 100644
> --- a/include/linux/if_packet.h
> +++ b/include/linux/if_packet.h
> @@ -46,6 +46,7 @@ struct sockaddr_ll {
>  #define PACKET_RESERVE			12
>  #define PACKET_TX_RING			13
>  #define PACKET_LOSS			14
> +#define PACKET_VNET_HDR			15
>  
>  struct tpacket_stats {
>  	unsigned int	tp_packets;
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 53633c5..36d5360 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -80,6 +80,8 @@
>  #include <linux/init.h>
>  #include <linux/mutex.h>
>  #include <linux/if_vlan.h>
> +#include <linux/virtio_net.h>
> +#include <linux/if_arp.h>
>  
>  #ifdef CONFIG_INET
>  #include <net/inet_common.h>
> @@ -193,7 +195,8 @@ struct packet_sock {
>  	struct mutex		pg_vec_lock;
>  	unsigned int		running:1,	/* prot_hook is attached*/
>  				auxdata:1,
> -				origdev:1;
> +				origdev:1,
> +				vnet_hdr:1;
>  	int			ifindex;	/* bound device		*/
>  	__be16			num;
>  	struct packet_mclist	*mclist;
> @@ -1056,6 +1059,30 @@ out:
>  }
>  #endif
>  
> +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
> +					       size_t reserve, size_t len,
> +					       size_t linear, int noblock,
> +					       int *err)
> +{
> +	struct sk_buff *skb;
> +
> +	/* Under a page?  Don't bother with paged skb. */
> +	if (prepad + len < PAGE_SIZE || !linear)
> +		linear = len;
> +
> +	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
> +				   err);
> +	if (!skb)
> +		return NULL;
> +
> +	skb_reserve(skb, reserve);
> +	skb_put(skb, linear);
> +	skb->data_len = len - linear;
> +	skb->len += len - linear;
> +
> +	return skb;
> +}
> +
>  static int packet_snd(struct socket *sock,
>  			  struct msghdr *msg, size_t len)
>  {
> @@ -1066,14 +1093,15 @@ static int packet_snd(struct socket *sock,
>  	__be16 proto;
>  	unsigned char *addr;
>  	int ifindex, err, reserve = 0;
> +	struct virtio_net_hdr vnethdr = { 0 };
> +	int offset = 0;
> +	struct packet_sock *po = pkt_sk(sk);
>  
>  	/*
>  	 *	Get and verify the address.
>  	 */
>  
>  	if (saddr == NULL) {
> -		struct packet_sock *po = pkt_sk(sk);
> -
>  		ifindex	= po->ifindex;
>  		proto	= po->num;
>  		addr	= NULL;
> @@ -1100,25 +1128,52 @@ static int packet_snd(struct socket *sock,
>  	if (!(dev->flags & IFF_UP))
>  		goto out_unlock;
>  
> -	err = -EMSGSIZE;
> -	if (len > dev->mtu+reserve)
> -		goto out_unlock;
> +	if (po->vnet_hdr) {
> +		err = -EINVAL;
> +		if (dev->type != ARPHRD_ETHER)
> +			goto out_unlock;
> +
> +		if (len < sizeof(vnethdr))
> +			goto out_unlock;
>  
> -	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
> -				msg->msg_flags & MSG_DONTWAIT, &err);
> +		len -= sizeof(vnethdr);
> +
> +		err = -EFAULT;
> +		if (memcpy_fromiovec((void *)&vnethdr, msg->msg_iov,
> +					sizeof(vnethdr))) 
> +			goto out_unlock;
> +	
> +		if ((vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> +		    (vnethdr.csum_start + vnethdr.csum_offset + 2 >
> +		      vnethdr.hdr_len))
> +			vnethdr.hdr_len = vnethdr.csum_start +
> +						 vnethdr.csum_offset + 2;
> +
> +		err = -EINVAL;
> +		if (vnethdr.hdr_len > len)
> +			goto out_unlock;
> +	} else {
> +		err = -EMSGSIZE;
> +		if (len > dev->mtu+reserve)
> +			goto out_unlock;

IMO we should always perform the length check if GSO is off.

> +	}
> +
> +	err = -ENOBUFS;
> +	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
> +			       LL_RESERVED_SPACE(dev), len, vnethdr.hdr_len,
> +			       msg->msg_flags & MSG_DONTWAIT, &err);
>  	if (skb == NULL)
>  		goto out_unlock;
>  
> -	skb_reserve(skb, LL_RESERVED_SPACE(dev));
> -	skb_reset_network_header(skb);
> +	skb_set_network_header(skb, reserve);

I think the above is wrong for vlans?

>  
>  	err = -EINVAL;
>  	if (sock->type == SOCK_DGRAM &&
> -	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
> +	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
>  		goto out_free;
>  
>  	/* Returns -EFAULT on error */
> -	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
> +	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
>  	if (err)
>  		goto out_free;
>  
> @@ -1127,6 +1182,51 @@ static int packet_snd(struct socket *sock,
>  	skb->priority = sk->sk_priority;
>  	skb->mark = sk->sk_mark;
>  
> +	if (po->vnet_hdr) {
> +		skb_reset_mac_header(skb);
> +		skb->protocol = eth_hdr(skb)->h_proto;
> +

Is this also broken for vlans?

> +		if (vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> +			if (!skb_partial_csum_set(skb, vnethdr.csum_start,
> +						  vnethdr.csum_offset)) {
> +				err = -EINVAL;
> +				goto out_free;
> +			}
> +		}
> +
> +		if (vnethdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
> +			switch (vnethdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
> +			case VIRTIO_NET_HDR_GSO_TCPV4:
> +				skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
> +				break;
> +			case VIRTIO_NET_HDR_GSO_TCPV6:
> +				skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
> +				break;
> +			case VIRTIO_NET_HDR_GSO_UDP:
> +				skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
> +				break;
> +			default:
> +				err = -EINVAL;
> +				goto out_free;
> +			}
> +
> +			if (vnethdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
> +				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
> +
> +			skb_shinfo(skb)->gso_size = vnethdr.gso_size;
> +			if (skb_shinfo(skb)->gso_size == 0) {
> +				err = -EINVAL;
> +				goto out_free;
> +                	}
> +
> +			/* Header must be checked, and gso_segs computed. */
> +			skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
> +			skb_shinfo(skb)->gso_segs = 0;
> +		}
> +
> +		len += sizeof(vnethdr);
> +	}
> +
>  	/*
>  	 *	Now send it
>  	 */
> @@ -1420,6 +1520,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
>  	struct sk_buff *skb;
>  	int copied, err;
>  	struct sockaddr_ll *sll;
> +	int vnet_hdr_len = 0;
>  
>  	err = -EINVAL;
>  	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
> @@ -1451,6 +1552,44 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
>  	if (skb == NULL)
>  		goto out;
>  
> +	if (pkt_sk(sk)->vnet_hdr) {
> +		struct virtio_net_hdr vnethdr = { 0 };
> +
> +		vnet_hdr_len = sizeof(vnethdr);
> +		if ((len -= vnet_hdr_len) < 0)
> +			return -EINVAL;
> +
> +		if (skb_is_gso(skb)) {
> +			struct skb_shared_info *sinfo = skb_shinfo(skb);
> +
> +			/* This is a hint as to how much should be linear. */
> +			vnethdr.hdr_len = skb_headlen(skb);
> +			vnethdr.gso_size = sinfo->gso_size;
> +			if (sinfo->gso_type & SKB_GSO_TCPV4)
> +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
> +			else if (sinfo->gso_type & SKB_GSO_TCPV6)
> +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
> +			else if (sinfo->gso_type & SKB_GSO_UDP)
> +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
> +			else
> +				BUG();

Is there any chance this can get SKB_GSO_FCOE by binding to
an appropriate interface?  Maybe we don't want to BUG().

> +			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
> +				vnethdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
> +		} else
> +			vnethdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
> +
> +		if (skb->ip_summed == CHECKSUM_PARTIAL) {
> +			vnethdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
> +			vnethdr.csum_start = skb->csum_start - skb_headroom(skb);
> +			vnethdr.csum_offset = skb->csum_offset;
> +		} /* else everything is zero */
> +
> +		if (unlikely(memcpy_toiovec(msg->msg_iov, (void *)&vnethdr,
> +					    sizeof(vnethdr)))) {
> +			return -EFAULT;
> +		}
> +	}
> +
>  	/*
>  	 *	If the address length field is there to be filled in, we fill
>  	 *	it in now.
> @@ -1502,7 +1641,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
>  	 *	Free or return the buffer as appropriate. Again this
>  	 *	hides all the races and re-entrancy issues from us.
>  	 */
> -	err = (flags&MSG_TRUNC) ? skb->len : copied;
> +	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
>  
>  out_free:
>  	skb_free_datagram(sk, skb);
> @@ -1826,6 +1965,22 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
>  		po->origdev = !!val;
>  		return 0;
>  	}
> +	case PACKET_VNET_HDR:
> +	{
> +		int val;
> +
> +		if (sock->type != SOCK_RAW)
> +			return -EINVAL;
> +		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)	
> +			return -EBUSY;

Another way to get a broken ring + vnet hdr configuration
would be to enable vnet hdr first and mmap second.
I think we need to guard against this as well, by checking vnet_hdr
when tx/rx ring is enabled.

> +		if (optlen < sizeof(val))
> +			return -EINVAL;
> +		if (copy_from_user(&val, optval, sizeof(val)))
> +			return -EFAULT;
> +
> +		po->vnet_hdr = !!val;
> +		return 0;
> +	}
>  	default:
>  		return -ENOPROTOOPT;
>  	}
> @@ -1876,6 +2031,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
>  
>  		data = &val;
>  		break;
> +	case PACKET_VNET_HDR:
> +		if (len > sizeof(int))
> +			len = sizeof(int);
> +		val = po->vnet_hdr;
> +
> +		data = &val;
> +		break;
>  #ifdef CONFIG_PACKET_MMAP
>  	case PACKET_VERSION:
>  		if (len > sizeof(int))
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala - Jan. 27, 2010, 5:42 p.m.
On Wed, 2010-01-27 at 13:42 +0200, Michael S. Tsirkin wrote: 
> On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote:
> > This patch adds GSO/checksum offload to af_packet sockets using
> > virtio_net_hdr. Based on Rusty's patch to add this support to tun.
> > It allows GSO/checksum offload to be enabled when using raw socket
> > backend with virtio_net.
> > Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the
> > receive path and process/skip virtio_net_hdr in the send path. This
> > option is only allowed with SOCK_RAW sockets attached to ethernet
> > type devices.
> > 
> > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> 
> So the main issue with this implemenation is that it silently fails for
> non-ethernet protocols.  It would be better to detect unsupported
> protocols and return an error to user. 

Yes. I could return EINVAL or EOPNOTSUPP when trying to send/receive a
packet with virtio_net_hdr on non-ethernet devices.

> This is same issue that was
> pointed out by DaveM with my earlier attempt to solve a different
> (related) problem:
> http://lkml.org/lkml/2010/1/5/474
> For an incomplete prototype attempting to solve the issue in a generic way:
> http://lkml.org/lkml/2010/1/6/56
> 
> A couple of additional comments below.
> 
> > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> > index 4021d47..aa57a5f 100644
> > --- a/include/linux/if_packet.h
> > +++ b/include/linux/if_packet.h
> > @@ -46,6 +46,7 @@ struct sockaddr_ll {
> >  #define PACKET_RESERVE			12
> >  #define PACKET_TX_RING			13
> >  #define PACKET_LOSS			14
> > +#define PACKET_VNET_HDR			15
> >  
> >  struct tpacket_stats {
> >  	unsigned int	tp_packets;
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index 53633c5..36d5360 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -80,6 +80,8 @@
> >  #include <linux/init.h>
> >  #include <linux/mutex.h>
> >  #include <linux/if_vlan.h>
> > +#include <linux/virtio_net.h>
> > +#include <linux/if_arp.h>
> >  
> >  #ifdef CONFIG_INET
> >  #include <net/inet_common.h>
> > @@ -193,7 +195,8 @@ struct packet_sock {
> >  	struct mutex		pg_vec_lock;
> >  	unsigned int		running:1,	/* prot_hook is attached*/
> >  				auxdata:1,
> > -				origdev:1;
> > +				origdev:1,
> > +				vnet_hdr:1;
> >  	int			ifindex;	/* bound device		*/
> >  	__be16			num;
> >  	struct packet_mclist	*mclist;
> > @@ -1056,6 +1059,30 @@ out:
> >  }
> >  #endif
> >  
> > +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
> > +					       size_t reserve, size_t len,
> > +					       size_t linear, int noblock,
> > +					       int *err)
> > +{
> > +	struct sk_buff *skb;
> > +
> > +	/* Under a page?  Don't bother with paged skb. */
> > +	if (prepad + len < PAGE_SIZE || !linear)
> > +		linear = len;
> > +
> > +	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
> > +				   err);
> > +	if (!skb)
> > +		return NULL;
> > +
> > +	skb_reserve(skb, reserve);
> > +	skb_put(skb, linear);
> > +	skb->data_len = len - linear;
> > +	skb->len += len - linear;
> > +
> > +	return skb;
> > +}
> > +
> >  static int packet_snd(struct socket *sock,
> >  			  struct msghdr *msg, size_t len)
> >  {
> > @@ -1066,14 +1093,15 @@ static int packet_snd(struct socket *sock,
> >  	__be16 proto;
> >  	unsigned char *addr;
> >  	int ifindex, err, reserve = 0;
> > +	struct virtio_net_hdr vnethdr = { 0 };
> > +	int offset = 0;
> > +	struct packet_sock *po = pkt_sk(sk);
> >  
> >  	/*
> >  	 *	Get and verify the address.
> >  	 */
> >  
> >  	if (saddr == NULL) {
> > -		struct packet_sock *po = pkt_sk(sk);
> > -
> >  		ifindex	= po->ifindex;
> >  		proto	= po->num;
> >  		addr	= NULL;
> > @@ -1100,25 +1128,52 @@ static int packet_snd(struct socket *sock,
> >  	if (!(dev->flags & IFF_UP))
> >  		goto out_unlock;
> >  
> > -	err = -EMSGSIZE;
> > -	if (len > dev->mtu+reserve)
> > -		goto out_unlock;
> > +	if (po->vnet_hdr) {
> > +		err = -EINVAL;
> > +		if (dev->type != ARPHRD_ETHER)
> > +			goto out_unlock;
> > +
> > +		if (len < sizeof(vnethdr))
> > +			goto out_unlock;
> >  
> > -	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
> > -				msg->msg_flags & MSG_DONTWAIT, &err);
> > +		len -= sizeof(vnethdr);
> > +
> > +		err = -EFAULT;
> > +		if (memcpy_fromiovec((void *)&vnethdr, msg->msg_iov,
> > +					sizeof(vnethdr))) 
> > +			goto out_unlock;
> > +	
> > +		if ((vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> > +		    (vnethdr.csum_start + vnethdr.csum_offset + 2 >
> > +		      vnethdr.hdr_len))
> > +			vnethdr.hdr_len = vnethdr.csum_start +
> > +						 vnethdr.csum_offset + 2;
> > +
> > +		err = -EINVAL;
> > +		if (vnethdr.hdr_len > len)
> > +			goto out_unlock;
> > +	} else {
> > +		err = -EMSGSIZE;
> > +		if (len > dev->mtu+reserve)
> > +			goto out_unlock;
> 
> IMO we should always perform the length check if GSO is off.
OK. I will fix this. 
> 
> > +	}
> > +
> > +	err = -ENOBUFS;
> > +	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
> > +			       LL_RESERVED_SPACE(dev), len, vnethdr.hdr_len,
> > +			       msg->msg_flags & MSG_DONTWAIT, &err);
> >  	if (skb == NULL)
> >  		goto out_unlock;
> >  
> > -	skb_reserve(skb, LL_RESERVED_SPACE(dev));
> > -	skb_reset_network_header(skb);
> > +	skb_set_network_header(skb, reserve);
> 
> I think the above is wrong for vlans?

I also thought we need to address vlans here, but even tun doesn't
handle this in the send routine. I submitted a patch that fixed
skb_gso_segment() to handle vlan packets. This will address both
tun and packet sockets.
http://thread.gmane.org/gmane.linux.network/150198

With this patch, i tested vlans with both ipv4 and ipv6.

> 
> >  
> >  	err = -EINVAL;
> >  	if (sock->type == SOCK_DGRAM &&
> > -	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
> > +	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
> >  		goto out_free;
> >  
> >  	/* Returns -EFAULT on error */
> > -	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
> > +	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
> >  	if (err)
> >  		goto out_free;
> >  
> > @@ -1127,6 +1182,51 @@ static int packet_snd(struct socket *sock,
> >  	skb->priority = sk->sk_priority;
> >  	skb->mark = sk->sk_mark;
> >  
> > +	if (po->vnet_hdr) {
> > +		skb_reset_mac_header(skb);
> > +		skb->protocol = eth_hdr(skb)->h_proto;
> > +
> 
> Is this also broken for vlans?

Same as above. 
> 
> > +		if (vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> > +			if (!skb_partial_csum_set(skb, vnethdr.csum_start,
> > +						  vnethdr.csum_offset)) {
> > +				err = -EINVAL;
> > +				goto out_free;
> > +			}
> > +		}
> > +
> > +		if (vnethdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
> > +			switch (vnethdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
> > +			case VIRTIO_NET_HDR_GSO_TCPV4:
> > +				skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
> > +				break;
> > +			case VIRTIO_NET_HDR_GSO_TCPV6:
> > +				skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
> > +				break;
> > +			case VIRTIO_NET_HDR_GSO_UDP:
> > +				skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
> > +				break;
> > +			default:
> > +				err = -EINVAL;
> > +				goto out_free;
> > +			}
> > +
> > +			if (vnethdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
> > +				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
> > +
> > +			skb_shinfo(skb)->gso_size = vnethdr.gso_size;
> > +			if (skb_shinfo(skb)->gso_size == 0) {
> > +				err = -EINVAL;
> > +				goto out_free;
> > +                	}
> > +
> > +			/* Header must be checked, and gso_segs computed. */
> > +			skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
> > +			skb_shinfo(skb)->gso_segs = 0;
> > +		}
> > +
> > +		len += sizeof(vnethdr);
> > +	}
> > +
> >  	/*
> >  	 *	Now send it
> >  	 */
> > @@ -1420,6 +1520,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> >  	struct sk_buff *skb;
> >  	int copied, err;
> >  	struct sockaddr_ll *sll;
> > +	int vnet_hdr_len = 0;
> >  
> >  	err = -EINVAL;
> >  	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
> > @@ -1451,6 +1552,44 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> >  	if (skb == NULL)
> >  		goto out;
> >  
> > +	if (pkt_sk(sk)->vnet_hdr) {
> > +		struct virtio_net_hdr vnethdr = { 0 };
> > +
> > +		vnet_hdr_len = sizeof(vnethdr);
> > +		if ((len -= vnet_hdr_len) < 0)
> > +			return -EINVAL;
> > +
> > +		if (skb_is_gso(skb)) {
> > +			struct skb_shared_info *sinfo = skb_shinfo(skb);
> > +
> > +			/* This is a hint as to how much should be linear. */
> > +			vnethdr.hdr_len = skb_headlen(skb);
> > +			vnethdr.gso_size = sinfo->gso_size;
> > +			if (sinfo->gso_type & SKB_GSO_TCPV4)
> > +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
> > +			else if (sinfo->gso_type & SKB_GSO_TCPV6)
> > +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
> > +			else if (sinfo->gso_type & SKB_GSO_UDP)
> > +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
> > +			else
> > +				BUG();
> 
> Is there any chance this can get SKB_GSO_FCOE by binding to
> an appropriate interface?  Maybe we don't want to BUG().
I could return -EINVAL in that case.

> 
> > +			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
> > +				vnethdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
> > +		} else
> > +			vnethdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
> > +
> > +		if (skb->ip_summed == CHECKSUM_PARTIAL) {
> > +			vnethdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
> > +			vnethdr.csum_start = skb->csum_start - skb_headroom(skb);
> > +			vnethdr.csum_offset = skb->csum_offset;
> > +		} /* else everything is zero */
> > +
> > +		if (unlikely(memcpy_toiovec(msg->msg_iov, (void *)&vnethdr,
> > +					    sizeof(vnethdr)))) {
> > +			return -EFAULT;
> > +		}
> > +	}
> > +
> >  	/*
> >  	 *	If the address length field is there to be filled in, we fill
> >  	 *	it in now.
> > @@ -1502,7 +1641,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> >  	 *	Free or return the buffer as appropriate. Again this
> >  	 *	hides all the races and re-entrancy issues from us.
> >  	 */
> > -	err = (flags&MSG_TRUNC) ? skb->len : copied;
> > +	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
> >  
> >  out_free:
> >  	skb_free_datagram(sk, skb);
> > @@ -1826,6 +1965,22 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
> >  		po->origdev = !!val;
> >  		return 0;
> >  	}
> > +	case PACKET_VNET_HDR:
> > +	{
> > +		int val;
> > +
> > +		if (sock->type != SOCK_RAW)
> > +			return -EINVAL;
> > +		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)	
> > +			return -EBUSY;
> 
> Another way to get a broken ring + vnet hdr configuration
> would be to enable vnet hdr first and mmap second.
> I think we need to guard against this as well, by checking vnet_hdr
> when tx/rx ring is enabled.

OK. i will add a check when setting PACKET_RX_RING/TX_RING socket
options. 
> > +		if (optlen < sizeof(val))
> > +			return -EINVAL;
> > +		if (copy_from_user(&val, optval, sizeof(val)))
> > +			return -EFAULT;
> > +
> > +		po->vnet_hdr = !!val;
> > +		return 0;
> > +	}
> >  	default:
> >  		return -ENOPROTOOPT;
> >  	}
> > @@ -1876,6 +2031,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
> >  
> >  		data = &val;
> >  		break;
> > +	case PACKET_VNET_HDR:
> > +		if (len > sizeof(int))
> > +			len = sizeof(int);
> > +		val = po->vnet_hdr;
> > +
> > +		data = &val;
> > +		break;
> >  #ifdef CONFIG_PACKET_MMAP
> >  	case PACKET_VERSION:
> >  		if (len > sizeof(int))
> > 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin - Jan. 27, 2010, 6:02 p.m.
On Wed, Jan 27, 2010 at 09:42:37AM -0800, Sridhar Samudrala wrote:
> On Wed, 2010-01-27 at 13:42 +0200, Michael S. Tsirkin wrote: 
> > On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote:
> > > This patch adds GSO/checksum offload to af_packet sockets using
> > > virtio_net_hdr. Based on Rusty's patch to add this support to tun.
> > > It allows GSO/checksum offload to be enabled when using raw socket
> > > backend with virtio_net.
> > > Adds PACKET_VNET_HDR socket option to prepend virtio_net_hdr in the
> > > receive path and process/skip virtio_net_hdr in the send path. This
> > > option is only allowed with SOCK_RAW sockets attached to ethernet
> > > type devices.
> > > 
> > > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> > 
> > So the main issue with this implemenation is that it silently fails for
> > non-ethernet protocols.  It would be better to detect unsupported
> > protocols and return an error to user. 
> 
> Yes. I could return EINVAL or EOPNOTSUPP when trying to send/receive a
> packet with virtio_net_hdr on non-ethernet devices.

non-ethernet *protocols* are at issue here.

> > This is same issue that was
> > pointed out by DaveM with my earlier attempt to solve a different
> > (related) problem:
> > http://lkml.org/lkml/2010/1/5/474
> > For an incomplete prototype attempting to solve the issue in a generic way:
> > http://lkml.org/lkml/2010/1/6/56
> > 
> > A couple of additional comments below.
> > 
> > > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
> > > index 4021d47..aa57a5f 100644
> > > --- a/include/linux/if_packet.h
> > > +++ b/include/linux/if_packet.h
> > > @@ -46,6 +46,7 @@ struct sockaddr_ll {
> > >  #define PACKET_RESERVE			12
> > >  #define PACKET_TX_RING			13
> > >  #define PACKET_LOSS			14
> > > +#define PACKET_VNET_HDR			15
> > >  
> > >  struct tpacket_stats {
> > >  	unsigned int	tp_packets;
> > > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > > index 53633c5..36d5360 100644
> > > --- a/net/packet/af_packet.c
> > > +++ b/net/packet/af_packet.c
> > > @@ -80,6 +80,8 @@
> > >  #include <linux/init.h>
> > >  #include <linux/mutex.h>
> > >  #include <linux/if_vlan.h>
> > > +#include <linux/virtio_net.h>
> > > +#include <linux/if_arp.h>
> > >  
> > >  #ifdef CONFIG_INET
> > >  #include <net/inet_common.h>
> > > @@ -193,7 +195,8 @@ struct packet_sock {
> > >  	struct mutex		pg_vec_lock;
> > >  	unsigned int		running:1,	/* prot_hook is attached*/
> > >  				auxdata:1,
> > > -				origdev:1;
> > > +				origdev:1,
> > > +				vnet_hdr:1;
> > >  	int			ifindex;	/* bound device		*/
> > >  	__be16			num;
> > >  	struct packet_mclist	*mclist;
> > > @@ -1056,6 +1059,30 @@ out:
> > >  }
> > >  #endif
> > >  
> > > +static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
> > > +					       size_t reserve, size_t len,
> > > +					       size_t linear, int noblock,
> > > +					       int *err)
> > > +{
> > > +	struct sk_buff *skb;
> > > +
> > > +	/* Under a page?  Don't bother with paged skb. */
> > > +	if (prepad + len < PAGE_SIZE || !linear)
> > > +		linear = len;
> > > +
> > > +	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
> > > +				   err);
> > > +	if (!skb)
> > > +		return NULL;
> > > +
> > > +	skb_reserve(skb, reserve);
> > > +	skb_put(skb, linear);
> > > +	skb->data_len = len - linear;
> > > +	skb->len += len - linear;
> > > +
> > > +	return skb;
> > > +}
> > > +
> > >  static int packet_snd(struct socket *sock,
> > >  			  struct msghdr *msg, size_t len)
> > >  {
> > > @@ -1066,14 +1093,15 @@ static int packet_snd(struct socket *sock,
> > >  	__be16 proto;
> > >  	unsigned char *addr;
> > >  	int ifindex, err, reserve = 0;
> > > +	struct virtio_net_hdr vnethdr = { 0 };
> > > +	int offset = 0;
> > > +	struct packet_sock *po = pkt_sk(sk);
> > >  
> > >  	/*
> > >  	 *	Get and verify the address.
> > >  	 */
> > >  
> > >  	if (saddr == NULL) {
> > > -		struct packet_sock *po = pkt_sk(sk);
> > > -
> > >  		ifindex	= po->ifindex;
> > >  		proto	= po->num;
> > >  		addr	= NULL;
> > > @@ -1100,25 +1128,52 @@ static int packet_snd(struct socket *sock,
> > >  	if (!(dev->flags & IFF_UP))
> > >  		goto out_unlock;
> > >  
> > > -	err = -EMSGSIZE;
> > > -	if (len > dev->mtu+reserve)
> > > -		goto out_unlock;
> > > +	if (po->vnet_hdr) {
> > > +		err = -EINVAL;
> > > +		if (dev->type != ARPHRD_ETHER)
> > > +			goto out_unlock;
> > > +
> > > +		if (len < sizeof(vnethdr))
> > > +			goto out_unlock;
> > >  
> > > -	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
> > > -				msg->msg_flags & MSG_DONTWAIT, &err);
> > > +		len -= sizeof(vnethdr);
> > > +
> > > +		err = -EFAULT;
> > > +		if (memcpy_fromiovec((void *)&vnethdr, msg->msg_iov,
> > > +					sizeof(vnethdr))) 
> > > +			goto out_unlock;
> > > +	
> > > +		if ((vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> > > +		    (vnethdr.csum_start + vnethdr.csum_offset + 2 >
> > > +		      vnethdr.hdr_len))
> > > +			vnethdr.hdr_len = vnethdr.csum_start +
> > > +						 vnethdr.csum_offset + 2;
> > > +
> > > +		err = -EINVAL;
> > > +		if (vnethdr.hdr_len > len)
> > > +			goto out_unlock;
> > > +	} else {
> > > +		err = -EMSGSIZE;
> > > +		if (len > dev->mtu+reserve)
> > > +			goto out_unlock;
> > 
> > IMO we should always perform the length check if GSO is off.
> OK. I will fix this. 
> > 
> > > +	}
> > > +
> > > +	err = -ENOBUFS;
> > > +	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
> > > +			       LL_RESERVED_SPACE(dev), len, vnethdr.hdr_len,
> > > +			       msg->msg_flags & MSG_DONTWAIT, &err);
> > >  	if (skb == NULL)
> > >  		goto out_unlock;
> > >  
> > > -	skb_reserve(skb, LL_RESERVED_SPACE(dev));
> > > -	skb_reset_network_header(skb);
> > > +	skb_set_network_header(skb, reserve);
> > 
> > I think the above is wrong for vlans?
> 
> I also thought we need to address vlans here, but even tun doesn't
> handle this in the send routine. I submitted a patch that fixed
> skb_gso_segment() to handle vlan packets. This will address both
> tun and packet sockets.
> http://thread.gmane.org/gmane.linux.network/150198
> 
> With this patch, i tested vlans with both ipv4 and ipv6.

Well, there are more protocols than just vlans :)
BTW, if there are dependencies between patches, you probably
want to put them in a patchset so they can be judged together.


> > 
> > >  
> > >  	err = -EINVAL;
> > >  	if (sock->type == SOCK_DGRAM &&
> > > -	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
> > > +	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
> > >  		goto out_free;
> > >  
> > >  	/* Returns -EFAULT on error */
> > > -	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
> > > +	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
> > >  	if (err)
> > >  		goto out_free;
> > >  
> > > @@ -1127,6 +1182,51 @@ static int packet_snd(struct socket *sock,
> > >  	skb->priority = sk->sk_priority;
> > >  	skb->mark = sk->sk_mark;
> > >  
> > > +	if (po->vnet_hdr) {
> > > +		skb_reset_mac_header(skb);
> > > +		skb->protocol = eth_hdr(skb)->h_proto;
> > > +
> > 
> > Is this also broken for vlans?
> 
> Same as above. 
> > 
> > > +		if (vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> > > +			if (!skb_partial_csum_set(skb, vnethdr.csum_start,
> > > +						  vnethdr.csum_offset)) {
> > > +				err = -EINVAL;
> > > +				goto out_free;
> > > +			}
> > > +		}
> > > +
> > > +		if (vnethdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
> > > +			switch (vnethdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
> > > +			case VIRTIO_NET_HDR_GSO_TCPV4:
> > > +				skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
> > > +				break;
> > > +			case VIRTIO_NET_HDR_GSO_TCPV6:
> > > +				skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
> > > +				break;
> > > +			case VIRTIO_NET_HDR_GSO_UDP:
> > > +				skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
> > > +				break;
> > > +			default:
> > > +				err = -EINVAL;
> > > +				goto out_free;
> > > +			}
> > > +
> > > +			if (vnethdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
> > > +				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
> > > +
> > > +			skb_shinfo(skb)->gso_size = vnethdr.gso_size;
> > > +			if (skb_shinfo(skb)->gso_size == 0) {
> > > +				err = -EINVAL;
> > > +				goto out_free;
> > > +                	}
> > > +
> > > +			/* Header must be checked, and gso_segs computed. */
> > > +			skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
> > > +			skb_shinfo(skb)->gso_segs = 0;
> > > +		}
> > > +
> > > +		len += sizeof(vnethdr);
> > > +	}
> > > +
> > >  	/*
> > >  	 *	Now send it
> > >  	 */
> > > @@ -1420,6 +1520,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> > >  	struct sk_buff *skb;
> > >  	int copied, err;
> > >  	struct sockaddr_ll *sll;
> > > +	int vnet_hdr_len = 0;
> > >  
> > >  	err = -EINVAL;
> > >  	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
> > > @@ -1451,6 +1552,44 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> > >  	if (skb == NULL)
> > >  		goto out;
> > >  
> > > +	if (pkt_sk(sk)->vnet_hdr) {
> > > +		struct virtio_net_hdr vnethdr = { 0 };
> > > +
> > > +		vnet_hdr_len = sizeof(vnethdr);
> > > +		if ((len -= vnet_hdr_len) < 0)
> > > +			return -EINVAL;
> > > +
> > > +		if (skb_is_gso(skb)) {
> > > +			struct skb_shared_info *sinfo = skb_shinfo(skb);
> > > +
> > > +			/* This is a hint as to how much should be linear. */
> > > +			vnethdr.hdr_len = skb_headlen(skb);
> > > +			vnethdr.gso_size = sinfo->gso_size;
> > > +			if (sinfo->gso_type & SKB_GSO_TCPV4)
> > > +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
> > > +			else if (sinfo->gso_type & SKB_GSO_TCPV6)
> > > +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
> > > +			else if (sinfo->gso_type & SKB_GSO_UDP)
> > > +				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
> > > +			else
> > > +				BUG();
> > 
> > Is there any chance this can get SKB_GSO_FCOE by binding to
> > an appropriate interface?  Maybe we don't want to BUG().
> I could return -EINVAL in that case.
> 
> > 
> > > +			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
> > > +				vnethdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
> > > +		} else
> > > +			vnethdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
> > > +
> > > +		if (skb->ip_summed == CHECKSUM_PARTIAL) {
> > > +			vnethdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
> > > +			vnethdr.csum_start = skb->csum_start - skb_headroom(skb);
> > > +			vnethdr.csum_offset = skb->csum_offset;
> > > +		} /* else everything is zero */
> > > +
> > > +		if (unlikely(memcpy_toiovec(msg->msg_iov, (void *)&vnethdr,
> > > +					    sizeof(vnethdr)))) {
> > > +			return -EFAULT;
> > > +		}
> > > +	}
> > > +
> > >  	/*
> > >  	 *	If the address length field is there to be filled in, we fill
> > >  	 *	it in now.
> > > @@ -1502,7 +1641,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
> > >  	 *	Free or return the buffer as appropriate. Again this
> > >  	 *	hides all the races and re-entrancy issues from us.
> > >  	 */
> > > -	err = (flags&MSG_TRUNC) ? skb->len : copied;
> > > +	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
> > >  
> > >  out_free:
> > >  	skb_free_datagram(sk, skb);
> > > @@ -1826,6 +1965,22 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
> > >  		po->origdev = !!val;
> > >  		return 0;
> > >  	}
> > > +	case PACKET_VNET_HDR:
> > > +	{
> > > +		int val;
> > > +
> > > +		if (sock->type != SOCK_RAW)
> > > +			return -EINVAL;
> > > +		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)	
> > > +			return -EBUSY;
> > 
> > Another way to get a broken ring + vnet hdr configuration
> > would be to enable vnet hdr first and mmap second.
> > I think we need to guard against this as well, by checking vnet_hdr
> > when tx/rx ring is enabled.
> 
> OK. i will add a check when setting PACKET_RX_RING/TX_RING socket
> options. 
> > > +		if (optlen < sizeof(val))
> > > +			return -EINVAL;
> > > +		if (copy_from_user(&val, optval, sizeof(val)))
> > > +			return -EFAULT;
> > > +
> > > +		po->vnet_hdr = !!val;
> > > +		return 0;
> > > +	}
> > >  	default:
> > >  		return -ENOPROTOOPT;
> > >  	}
> > > @@ -1876,6 +2031,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
> > >  
> > >  		data = &val;
> > >  		break;
> > > +	case PACKET_VNET_HDR:
> > > +		if (len > sizeof(int))
> > > +			len = sizeof(int);
> > > +		val = po->vnet_hdr;
> > > +
> > > +		data = &val;
> > > +		break;
> > >  #ifdef CONFIG_PACKET_MMAP
> > >  	case PACKET_VERSION:
> > >  		if (len > sizeof(int))
> > > 
> > 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu - Jan. 29, 2010, 8:53 a.m.
On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote:
>
> +	if (po->vnet_hdr) {
> +		err = -EINVAL;
> +		if (dev->type != ARPHRD_ETHER)
> +			goto out_unlock;

We shouldn't have Ethernet-specific code in AF_PACKET.  What's
more, just because the device type is Ethernet it doesn't mean
that the packet is going to have an Ethernet header.

Cheers,
Sridhar Samudrala - Jan. 29, 2010, 9:25 p.m.
On Fri, 2010-01-29 at 21:53 +1300, Herbert Xu wrote:
> On Tue, Jan 26, 2010 at 12:30:19PM -0800, Sridhar Samudrala wrote:
> >
> > +	if (po->vnet_hdr) {
> > +		err = -EINVAL;
> > +		if (dev->type != ARPHRD_ETHER)
> > +			goto out_unlock;
> 
> We shouldn't have Ethernet-specific code in AF_PACKET.  What's
> more, just because the device type is Ethernet it doesn't mean
> that the packet is going to have an Ethernet header.

This check is to dis-allow processing of packets with virtio_net_hdr
destined for non-ethernet devices.
Is it OK if i add a check in packet_bind() to not allow binding to
a non-ethernet device when PACKET_VNET_HDR option is set?

I need to figure out a way to set the skb protocol correctly based
on the packet. Any clues?
Michael has posted a prototype patch that addresses this. Do you 
agree with this approach?
   http://lkml.org/lkml/2010/1/6/56

Did you get a chance to look at my other patch that adds a check
for VLAN packets in skb_gso_segment() to address a bug when sending
large VLAN packets from a guest?
  http://thread.gmane.org/gmane.linux.network/150198

Thanks
Sridhar

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu - Jan. 29, 2010, 9:36 p.m.
On Fri, Jan 29, 2010 at 01:25:08PM -0800, Sridhar Samudrala wrote:
> 
> This check is to dis-allow processing of packets with virtio_net_hdr
> destined for non-ethernet devices.
> Is it OK if i add a check in packet_bind() to not allow binding to
> a non-ethernet device when PACKET_VNET_HDR option is set?
> 
> I need to figure out a way to set the skb protocol correctly based
> on the packet. Any clues?

IMHO the skb protocol should be set by whatever is invoking the
sendmsg call.  After all they would know what the L2 header is
and should be able to deduce the protocol correctly.

Adding all this logic into AF_PACKET is just a hack.

Cheers,
Sridhar Samudrala - Jan. 29, 2010, 10:30 p.m.
On 1/29/2010 1:36 PM, Herbert Xu wrote:
> On Fri, Jan 29, 2010 at 01:25:08PM -0800, Sridhar Samudrala wrote:
>    
>> This check is to dis-allow processing of packets with virtio_net_hdr
>> destined for non-ethernet devices.
>> Is it OK if i add a check in packet_bind() to not allow binding to
>> a non-ethernet device when PACKET_VNET_HDR option is set?
>>
>> I need to figure out a way to set the skb protocol correctly based
>> on the packet. Any clues?
>>      
> IMHO the skb protocol should be set by whatever is invoking the
> sendmsg call.  After all they would know what the L2 header is
> and should be able to deduce the protocol correctly.
>    
In this use-case, the component that is calling sendmsg() can be either 
the vhost-net in host kernel
or qemu. They get the buffer including the L2 header from the guest. 
They too need to parse the L2
header to find the protocol.
The packet itself originates from the guest virtio-net driver. but when 
it converts the skb to a virtio-ring
buffer, some of the information in the skb(including protocol and vlan 
info etc) is lost. I guess if we could
re-design, virtio_net_hdr would include this info too. But i think it is 
too late to make any such changes.
So either the callers of sendmsg() or packet_snd() need to parse the L2 
header to get the protocol and
vlan info etc. sendmsg() caller could fill in the protocol, but not the 
vlan info.

Thanks
Sridhar

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Herbert Xu - Jan. 29, 2010, 11:22 p.m.
On Fri, Jan 29, 2010 at 02:30:49PM -0800, Sridhar Samudrala wrote:
>
> In this use-case, the component that is calling sendmsg() can be either  
> the vhost-net in host kernel
> or qemu. They get the buffer including the L2 header from the guest.  
> They too need to parse the L2
> header to find the protocol.

So have them parse the L2 header to get the protocol.  They know
that the packet should contain an Ethernet header, while this
knowledge would be inappropriate for af_packet in general.

This is also why we call eth_type_trans in network drivers as
opposed to the core.

Cheers,

Patch

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 4021d47..aa57a5f 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -46,6 +46,7 @@  struct sockaddr_ll {
 #define PACKET_RESERVE			12
 #define PACKET_TX_RING			13
 #define PACKET_LOSS			14
+#define PACKET_VNET_HDR			15
 
 struct tpacket_stats {
 	unsigned int	tp_packets;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 53633c5..36d5360 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -80,6 +80,8 @@ 
 #include <linux/init.h>
 #include <linux/mutex.h>
 #include <linux/if_vlan.h>
+#include <linux/virtio_net.h>
+#include <linux/if_arp.h>
 
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
@@ -193,7 +195,8 @@  struct packet_sock {
 	struct mutex		pg_vec_lock;
 	unsigned int		running:1,	/* prot_hook is attached*/
 				auxdata:1,
-				origdev:1;
+				origdev:1,
+				vnet_hdr:1;
 	int			ifindex;	/* bound device		*/
 	__be16			num;
 	struct packet_mclist	*mclist;
@@ -1056,6 +1059,30 @@  out:
 }
 #endif
 
+static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
+					       size_t reserve, size_t len,
+					       size_t linear, int noblock,
+					       int *err)
+{
+	struct sk_buff *skb;
+
+	/* Under a page?  Don't bother with paged skb. */
+	if (prepad + len < PAGE_SIZE || !linear)
+		linear = len;
+
+	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
+				   err);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, reserve);
+	skb_put(skb, linear);
+	skb->data_len = len - linear;
+	skb->len += len - linear;
+
+	return skb;
+}
+
 static int packet_snd(struct socket *sock,
 			  struct msghdr *msg, size_t len)
 {
@@ -1066,14 +1093,15 @@  static int packet_snd(struct socket *sock,
 	__be16 proto;
 	unsigned char *addr;
 	int ifindex, err, reserve = 0;
+	struct virtio_net_hdr vnethdr = { 0 };
+	int offset = 0;
+	struct packet_sock *po = pkt_sk(sk);
 
 	/*
 	 *	Get and verify the address.
 	 */
 
 	if (saddr == NULL) {
-		struct packet_sock *po = pkt_sk(sk);
-
 		ifindex	= po->ifindex;
 		proto	= po->num;
 		addr	= NULL;
@@ -1100,25 +1128,52 @@  static int packet_snd(struct socket *sock,
 	if (!(dev->flags & IFF_UP))
 		goto out_unlock;
 
-	err = -EMSGSIZE;
-	if (len > dev->mtu+reserve)
-		goto out_unlock;
+	if (po->vnet_hdr) {
+		err = -EINVAL;
+		if (dev->type != ARPHRD_ETHER)
+			goto out_unlock;
+
+		if (len < sizeof(vnethdr))
+			goto out_unlock;
 
-	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
-				msg->msg_flags & MSG_DONTWAIT, &err);
+		len -= sizeof(vnethdr);
+
+		err = -EFAULT;
+		if (memcpy_fromiovec((void *)&vnethdr, msg->msg_iov,
+					sizeof(vnethdr))) 
+			goto out_unlock;
+	
+		if ((vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		    (vnethdr.csum_start + vnethdr.csum_offset + 2 >
+		      vnethdr.hdr_len))
+			vnethdr.hdr_len = vnethdr.csum_start +
+						 vnethdr.csum_offset + 2;
+
+		err = -EINVAL;
+		if (vnethdr.hdr_len > len)
+			goto out_unlock;
+	} else {
+		err = -EMSGSIZE;
+		if (len > dev->mtu+reserve)
+			goto out_unlock;
+	}
+
+	err = -ENOBUFS;
+	skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
+			       LL_RESERVED_SPACE(dev), len, vnethdr.hdr_len,
+			       msg->msg_flags & MSG_DONTWAIT, &err);
 	if (skb == NULL)
 		goto out_unlock;
 
-	skb_reserve(skb, LL_RESERVED_SPACE(dev));
-	skb_reset_network_header(skb);
+	skb_set_network_header(skb, reserve);
 
 	err = -EINVAL;
 	if (sock->type == SOCK_DGRAM &&
-	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
+	    (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
 		goto out_free;
 
 	/* Returns -EFAULT on error */
-	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
+	err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
 	if (err)
 		goto out_free;
 
@@ -1127,6 +1182,51 @@  static int packet_snd(struct socket *sock,
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
 
+	if (po->vnet_hdr) {
+		skb_reset_mac_header(skb);
+		skb->protocol = eth_hdr(skb)->h_proto;
+
+		if (vnethdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+			if (!skb_partial_csum_set(skb, vnethdr.csum_start,
+						  vnethdr.csum_offset)) {
+				err = -EINVAL;
+				goto out_free;
+			}
+		}
+
+		if (vnethdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+			switch (vnethdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+			case VIRTIO_NET_HDR_GSO_TCPV4:
+				skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+				break;
+			case VIRTIO_NET_HDR_GSO_TCPV6:
+				skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+				break;
+			case VIRTIO_NET_HDR_GSO_UDP:
+				skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+				break;
+			default:
+				err = -EINVAL;
+				goto out_free;
+			}
+
+			if (vnethdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
+				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
+
+			skb_shinfo(skb)->gso_size = vnethdr.gso_size;
+			if (skb_shinfo(skb)->gso_size == 0) {
+				err = -EINVAL;
+				goto out_free;
+                	}
+
+			/* Header must be checked, and gso_segs computed. */
+			skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+			skb_shinfo(skb)->gso_segs = 0;
+		}
+
+		len += sizeof(vnethdr);
+	}
+
 	/*
 	 *	Now send it
 	 */
@@ -1420,6 +1520,7 @@  static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct sk_buff *skb;
 	int copied, err;
 	struct sockaddr_ll *sll;
+	int vnet_hdr_len = 0;
 
 	err = -EINVAL;
 	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
@@ -1451,6 +1552,44 @@  static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (skb == NULL)
 		goto out;
 
+	if (pkt_sk(sk)->vnet_hdr) {
+		struct virtio_net_hdr vnethdr = { 0 };
+
+		vnet_hdr_len = sizeof(vnethdr);
+		if ((len -= vnet_hdr_len) < 0)
+			return -EINVAL;
+
+		if (skb_is_gso(skb)) {
+			struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+			/* This is a hint as to how much should be linear. */
+			vnethdr.hdr_len = skb_headlen(skb);
+			vnethdr.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				vnethdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				vnethdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			vnethdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			vnethdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			vnethdr.csum_start = skb->csum_start - skb_headroom(skb);
+			vnethdr.csum_offset = skb->csum_offset;
+		} /* else everything is zero */
+
+		if (unlikely(memcpy_toiovec(msg->msg_iov, (void *)&vnethdr,
+					    sizeof(vnethdr)))) {
+			return -EFAULT;
+		}
+	}
+
 	/*
 	 *	If the address length field is there to be filled in, we fill
 	 *	it in now.
@@ -1502,7 +1641,7 @@  static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 	 *	Free or return the buffer as appropriate. Again this
 	 *	hides all the races and re-entrancy issues from us.
 	 */
-	err = (flags&MSG_TRUNC) ? skb->len : copied;
+	err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
 
 out_free:
 	skb_free_datagram(sk, skb);
@@ -1826,6 +1965,22 @@  packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		po->origdev = !!val;
 		return 0;
 	}
+	case PACKET_VNET_HDR:
+	{
+		int val;
+
+		if (sock->type != SOCK_RAW)
+			return -EINVAL;
+		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)	
+			return -EBUSY;
+		if (optlen < sizeof(val))
+			return -EINVAL;
+		if (copy_from_user(&val, optval, sizeof(val)))
+			return -EFAULT;
+
+		po->vnet_hdr = !!val;
+		return 0;
+	}
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -1876,6 +2031,13 @@  static int packet_getsockopt(struct socket *sock, int level, int optname,
 
 		data = &val;
 		break;
+	case PACKET_VNET_HDR:
+		if (len > sizeof(int))
+			len = sizeof(int);
+		val = po->vnet_hdr;
+
+		data = &val;
+		break;
 #ifdef CONFIG_PACKET_MMAP
 	case PACKET_VERSION:
 		if (len > sizeof(int))