[V4,5/8] macvtap: macvtap TX zero-copy support

Message ID 1304496893.20660.88.camel@localhost.localdomain
State RFC, archived
Delegated to: David Miller

Commit Message

Shirley Ma May 4, 2011, 8:14 a.m. UTC
macvtap enables zero-copy only when the buffer size is greater than
GOODCOPY_LEN (256).

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---

 drivers/net/macvtap.c |  126 ++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 115 insertions(+), 11 deletions(-)




Comments

Michael S. Tsirkin May 4, 2011, 2:58 p.m. UTC | #1
On Wed, May 04, 2011 at 01:14:53AM -0700, Shirley Ma wrote:
> Only when buffer size is greater than GOODCOPY_LEN (256), macvtap
> enables zero-copy.
> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>


Looks good. Some thoughts below.

> ---
> 
>  drivers/net/macvtap.c |  126 ++++++++++++++++++++++++++++++++++++++++++++----
>  1 files changed, 115 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> index 6696e56..e8bc5ff 100644
> --- a/drivers/net/macvtap.c
> +++ b/drivers/net/macvtap.c
> @@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
>   */
>  static dev_t macvtap_major;
>  #define MACVTAP_NUM_DEVS 65536
> +#define GOODCOPY_LEN 256

Scope with MACVTAP_ please.

For small packets, is it better to copy in vhost
and skip all the back and forth with callbacks? If yes, does
it make sense to put the constant above in some header
shared with vhost-net?
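
(For illustration only, a minimal sketch of both suggestions, assuming a
shared header is acceptable; the header name below is made up, not an
existing file:)

/* hypothetical shared location, e.g. include/linux/macvtap.h */
#ifndef _LINUX_MACVTAP_H
#define _LINUX_MACVTAP_H

/* packets at or below this size are copied instead of pinning user pages */
#define MACVTAP_GOODCOPY_LEN 256

#endif /* _LINUX_MACVTAP_H */

Both drivers/net/macvtap.c and drivers/vhost/net.c could then include it and
use the same threshold.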

>  static struct class *macvtap_class;
>  static struct cdev macvtap_cdev;
>  
> @@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
>  {
>  	struct net *net = current->nsproxy->net_ns;
>  	struct net_device *dev = dev_get_by_index(net, iminor(inode));
> +	struct macvlan_dev *vlan = netdev_priv(dev);
>  	struct macvtap_queue *q;
>  	int err;
>  
> @@ -369,6 +371,16 @@ static int macvtap_open(struct inode *inode, struct file *file)
>  	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
>  	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
>  
> +	/*
> +	 * so far only VM uses macvtap, enable zero copy between guest
> +	 * kernel and host kernel when lower device supports high memory
> +	 * DMA
> +	 */
> +	if (vlan) {
> +		if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
> +			sock_set_flag(&q->sk, SOCK_ZEROCOPY);
> +	}
> +
>  	err = macvtap_set_queue(dev, file, q);
>  	if (err)
>  		sock_put(&q->sk);
> @@ -433,6 +445,80 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
>  	return skb;
>  }
>  
> +/* set skb frags from iovec, this can move to core network code for reuse */
> +static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
> +				  int offset, size_t count)
> +{
> +	int len = iov_length(from, count) - offset;
> +	int copy = skb_headlen(skb);
> +	int size, offset1 = 0;
> +	int i = 0;
> +	skb_frag_t *f;
> +
> +	/* Skip over from offset */
> +	while (offset >= from->iov_len) {
> +		offset -= from->iov_len;
> +		++from;
> +		--count;
> +	}
> +
> +	/* copy up to skb headlen */
> +	while (copy > 0) {
> +		size = min_t(unsigned int, copy, from->iov_len - offset);
> +		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
> +				   size))
> +			return -EFAULT;
> +		if (copy > size) {
> +			++from;
> +			--count;
> +		}
> +		copy -= size;
> +		offset1 += size;
> +		offset = 0;
> +	}
> +
> +	if (len == offset1)
> +		return 0;
> +
> +	while (count--) {
> +		struct page *page[MAX_SKB_FRAGS];
> +		int num_pages;
> +		unsigned long base;
> +
> +		len = from->iov_len - offset1;
> +		if (!len) {
> +			offset1 = 0;
> +			++from;
> +			continue;
> +		}
> +		base = (unsigned long)from->iov_base + offset1;
> +		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
> +		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
> +		if ((num_pages != size) ||
> +		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
> +			/* put_page is in skb free */
> +			return -EFAULT;
> +		skb->data_len += len;
> +		skb->len += len;
> +		skb->truesize += len;
> +		while (len) {
> +			f = &skb_shinfo(skb)->frags[i];
> +			f->page = page[i];
> +			f->page_offset = base & ~PAGE_MASK;
> +			f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
> +			skb_shinfo(skb)->nr_frags++;
> +			/* increase sk_wmem_alloc */
> +			atomic_add(f->size, &skb->sk->sk_wmem_alloc);

One thing that gave me pause is that we only account for part of the page
here. I think we should count the whole page, no?
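
(A rough sketch of what whole-page accounting could look like, reusing the
names from the hunk above; illustrative only, not tested code:)

		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if ((num_pages != size) ||
		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
			/* put_page is in skb free */
			return -EFAULT;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += len;
		/* charge the socket for every pinned page, not just for the
		 * bytes referenced by the frags, so userspace cannot pin much
		 * more memory than sk_wmem_alloc reflects */
		atomic_add(num_pages * PAGE_SIZE, &skb->sk->sk_wmem_alloc);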

> +			base += f->size;
> +			len -= f->size;
> +			i++;
> +		}
> +		offset1 = 0;
> +		++from;
> +	}
> +	return 0;
> +}
> +
>  /*
>   * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
>   * be shared with the tun/tap driver.
> @@ -515,17 +601,19 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
>  
>  
>  /* Get packet from user space buffer */
> -static ssize_t macvtap_get_user(struct macvtap_queue *q,
> -				const struct iovec *iv, size_t count,
> -				int noblock)
> +static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
> +				const struct iovec *iv, unsigned long total_len,
> +				size_t count, int noblock)
>  {
>  	struct sk_buff *skb;
>  	struct macvlan_dev *vlan;
> -	size_t len = count;
> +	unsigned long len = total_len;
>  	int err;
>  	struct virtio_net_hdr vnet_hdr = { 0 };
>  	int vnet_hdr_len = 0;
> +	int copylen, zerocopy;
>  
> +	zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len > GOODCOPY_LEN);
>  	if (q->flags & IFF_VNET_HDR) {
>  		vnet_hdr_len = q->vnet_hdr_sz;
>  
> @@ -552,12 +640,28 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
>  	if (unlikely(len < ETH_HLEN))
>  		goto err;
>  
> -	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
> -				noblock, &err);
> +	if (zerocopy)
> +		/* There are 256 bytes to be copied in skb, so there is enough
> +		 * room for skb expand head in case it is used.
> +		 * The rest buffer is mapped from userspace.
> +		 */
> +		copylen = GOODCOPY_LEN;

Just curious: where does the number 256 come from?
Also, as long as we are copying, should we care about
alignment?

> +	else
> +		copylen = len;
> +
> +	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
> +				vnet_hdr.hdr_len, noblock, &err);
>  	if (!skb)
>  		goto err;
>  
> -	err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len);
> +	if (zerocopy)
> +		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
> +	else
> +		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
> +						   len);
> +	if (sock_flag(&q->sk, SOCK_ZEROCOPY))
> +		memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
> +			sizeof(struct skb_ubuf_info));
>  	if (err)
>  		goto err_kfree;
>  
> @@ -579,7 +683,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
>  		kfree_skb(skb);
>  	rcu_read_unlock_bh();
>  
> -	return count;
> +	return total_len;
>  
>  err_kfree:
>  	kfree_skb(skb);
> @@ -601,8 +705,8 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
>  	ssize_t result = -ENOLINK;
>  	struct macvtap_queue *q = file->private_data;
>  
> -	result = macvtap_get_user(q, iv, iov_length(iv, count),
> -			      file->f_flags & O_NONBLOCK);
> +	result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
> +				  file->f_flags & O_NONBLOCK);
>  	return result;
>  }
>  
> @@ -815,7 +919,7 @@ static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
>  			   struct msghdr *m, size_t total_len)
>  {
>  	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
> -	return macvtap_get_user(q, m->msg_iov, total_len,
> +	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
>  			    m->msg_flags & MSG_DONTWAIT);
>  }
>  
> 
Shirley Ma May 4, 2011, 3:37 p.m. UTC | #2
On Wed, 2011-05-04 at 17:58 +0300, Michael S. Tsirkin wrote:
> On Wed, May 04, 2011 at 01:14:53AM -0700, Shirley Ma wrote:
> > Only when buffer size is greater than GOODCOPY_LEN (256), macvtap
> > enables zero-copy.
> > 
> > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> 
> 
> Looks good. Some thoughts below.
> 
> > ---
> > 
> >  drivers/net/macvtap.c |  126
> ++++++++++++++++++++++++++++++++++++++++++++----
> >  1 files changed, 115 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> > index 6696e56..e8bc5ff 100644
> > --- a/drivers/net/macvtap.c
> > +++ b/drivers/net/macvtap.c
> > @@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
> >   */
> >  static dev_t macvtap_major;
> >  #define MACVTAP_NUM_DEVS 65536
> > +#define GOODCOPY_LEN 256
> 
> Scope with MACVTAP_ please.
Ok.

> For small packets, is it better to copy in vhost
> and skip all the back and forth with callbacks? If yes, does
> it make sense to put the constant above in some header
> shared with vhost-net?

The skb is created in macvtap and the small-packet copy lands in the skb,
so I don't think we can do the copy in vhost here.

> >  static struct class *macvtap_class;
> >  static struct cdev macvtap_cdev;
> >  
> > @@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode,
> struct file *file)
> >  {
> >       struct net *net = current->nsproxy->net_ns;
> >       struct net_device *dev = dev_get_by_index(net, iminor(inode));
> > +     struct macvlan_dev *vlan = netdev_priv(dev);
> >       struct macvtap_queue *q;
> >       int err;
> >  
> > @@ -369,6 +371,16 @@ static int macvtap_open(struct inode *inode,
> struct file *file)
> >       q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
> >       q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
> >  
> > +     /*
> > +      * so far only VM uses macvtap, enable zero copy between guest
> > +      * kernel and host kernel when lower device supports high
> memory
> > +      * DMA
> > +      */
> > +     if (vlan) {
> > +             if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
> > +                     sock_set_flag(&q->sk, SOCK_ZEROCOPY);
> > +     }
> > +
> >       err = macvtap_set_queue(dev, file, q);
> >       if (err)
> >               sock_put(&q->sk);
> > @@ -433,6 +445,80 @@ static inline struct sk_buff
> *macvtap_alloc_skb(struct sock *sk, size_t prepad,
> >       return skb;
> >  }
> >  
> > +/* set skb frags from iovec, this can move to core network code for
> reuse */
> > +static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct
> iovec *from,
> > +                               int offset, size_t count)
> > +{
> > +     int len = iov_length(from, count) - offset;
> > +     int copy = skb_headlen(skb);
> > +     int size, offset1 = 0;
> > +     int i = 0;
> > +     skb_frag_t *f;
> > +
> > +     /* Skip over from offset */
> > +     while (offset >= from->iov_len) {
> > +             offset -= from->iov_len;
> > +             ++from;
> > +             --count;
> > +     }
> > +
> > +     /* copy up to skb headlen */
> > +     while (copy > 0) {
> > +             size = min_t(unsigned int, copy, from->iov_len -
> offset);
> > +             if (copy_from_user(skb->data + offset1, from->iov_base
> + offset,
> > +                                size))
> > +                     return -EFAULT;
> > +             if (copy > size) {
> > +                     ++from;
> > +                     --count;
> > +             }
> > +             copy -= size;
> > +             offset1 += size;
> > +             offset = 0;
> > +     }
> > +
> > +     if (len == offset1)
> > +             return 0;
> > +
> > +     while (count--) {
> > +             struct page *page[MAX_SKB_FRAGS];
> > +             int num_pages;
> > +             unsigned long base;
> > +
> > +             len = from->iov_len - offset1;
> > +             if (!len) {
> > +                     offset1 = 0;
> > +                     ++from;
> > +                     continue;
> > +             }
> > +             base = (unsigned long)from->iov_base + offset1;
> > +             size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >>
> PAGE_SHIFT;
> > +             num_pages = get_user_pages_fast(base, size, 0,
> &page[i]);
> > +             if ((num_pages != size) ||
> > +                 (num_pages > MAX_SKB_FRAGS -
> skb_shinfo(skb)->nr_frags))
> > +                     /* put_page is in skb free */
> > +                     return -EFAULT;
> > +             skb->data_len += len;
> > +             skb->len += len;
> > +             skb->truesize += len;
> > +             while (len) {
> > +                     f = &skb_shinfo(skb)->frags[i];
> > +                     f->page = page[i];
> > +                     f->page_offset = base & ~PAGE_MASK;
> > +                     f->size = min_t(int, len, PAGE_SIZE -
> f->page_offset);
> > +                     skb_shinfo(skb)->nr_frags++;
> > +                     /* increase sk_wmem_alloc */
> > +                     atomic_add(f->size, &skb->sk->sk_wmem_alloc);
> 
> One thing that gave me pause that we only accound for part of the page
> here. I think we should count the whole page, no?

The whole page is pinned, but it might not be pinned for this sock only.

I think I should move the atomic_add outside the loop to save cost.
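
(Sketch of hoisting the accounting out of the per-frag loop, using the
variables from the hunk above; the frag sizes in the inner loop sum to len,
so one atomic op per iovec segment is enough. Illustrative only:)

		skb->data_len += len;
		skb->len += len;
		skb->truesize += len;
		/* account once per iovec segment instead of once per frag */
		atomic_add(len, &skb->sk->sk_wmem_alloc);
		while (len) {
			f = &skb_shinfo(skb)->frags[i];
			f->page = page[i];
			f->page_offset = base & ~PAGE_MASK;
			f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
			skb_shinfo(skb)->nr_frags++;
			base += f->size;
			len -= f->size;
			i++;
		}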

> > +                     base += f->size;
> > +                     len -= f->size;
> > +                     i++;
> > +             }
> > +             offset1 = 0;
> > +             ++from;
> > +     }
> > +     return 0;
> > +}
> > +
> >  /*
> >   * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
> >   * be shared with the tun/tap driver.
> > @@ -515,17 +601,19 @@ static int macvtap_skb_to_vnet_hdr(const
> struct sk_buff *skb,
> >  
> >  
> >  /* Get packet from user space buffer */
> > -static ssize_t macvtap_get_user(struct macvtap_queue *q,
> > -                             const struct iovec *iv, size_t count,
> > -                             int noblock)
> > +static ssize_t macvtap_get_user(struct macvtap_queue *q, struct
> msghdr *m,
> > +                             const struct iovec *iv, unsigned long
> total_len,
> > +                             size_t count, int noblock)
> >  {
> >       struct sk_buff *skb;
> >       struct macvlan_dev *vlan;
> > -     size_t len = count;
> > +     unsigned long len = total_len;
> >       int err;
> >       struct virtio_net_hdr vnet_hdr = { 0 };
> >       int vnet_hdr_len = 0;
> > +     int copylen, zerocopy;
> >  
> > +     zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len >
> GOODCOPY_LEN);
> >       if (q->flags & IFF_VNET_HDR) {
> >               vnet_hdr_len = q->vnet_hdr_sz;
> >  
> > @@ -552,12 +640,28 @@ static ssize_t macvtap_get_user(struct
> macvtap_queue *q,
> >       if (unlikely(len < ETH_HLEN))
> >               goto err;
> >  
> > -     skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len,
> vnet_hdr.hdr_len,
> > -                             noblock, &err);
> > +     if (zerocopy)
> > +             /* There are 256 bytes to be copied in skb, so there
> is enough
> > +              * room for skb expand head in case it is used.
> > +              * The rest buffer is mapped from userspace.
> > +              */
> > +             copylen = GOODCOPY_LEN;
> 
> Just curious: where does the number 256 come from?
> Also, as long as we are copying, should we care about
> alignment?

256 makes the size big enough for any skb head expansion.

That was my concern before. But the guest should have aligned the buffer
already; after moving the pointer by 256 bytes it should still be aligned,
right?

> > +     else
> > +             copylen = len;
> > +
> > +     skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
> > +                             vnet_hdr.hdr_len, noblock, &err);
> >       if (!skb)
> >               goto err;
> >  
> > -     err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
> len);
> > +     if (zerocopy)
> > +             err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len,
> count);
> > +     else
> > +             err = skb_copy_datagram_from_iovec(skb, 0, iv,
> vnet_hdr_len,
> > +                                                len);
> > +     if (sock_flag(&q->sk, SOCK_ZEROCOPY))
> > +             memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
> > +                     sizeof(struct skb_ubuf_info));
> >       if (err)
> >               goto err_kfree;
> >  
> > @@ -579,7 +683,7 @@ static ssize_t macvtap_get_user(struct
> macvtap_queue *q,
> >               kfree_skb(skb);
> >       rcu_read_unlock_bh();
> >  
> > -     return count;
> > +     return total_len;
> >  
> >  err_kfree:
> >       kfree_skb(skb);
> > @@ -601,8 +705,8 @@ static ssize_t macvtap_aio_write(struct kiocb
> *iocb, const struct iovec *iv,
> >       ssize_t result = -ENOLINK;
> >       struct macvtap_queue *q = file->private_data;
> >  
> > -     result = macvtap_get_user(q, iv, iov_length(iv, count),
> > -                           file->f_flags & O_NONBLOCK);
> > +     result = macvtap_get_user(q, NULL, iv, iov_length(iv, count),
> count,
> > +                               file->f_flags & O_NONBLOCK);
> >       return result;
> >  }
> >  
> > @@ -815,7 +919,7 @@ static int macvtap_sendmsg(struct kiocb *iocb,
> struct socket *sock,
> >                          struct msghdr *m, size_t total_len)
> >  {
> >       struct macvtap_queue *q = container_of(sock, struct
> macvtap_queue, sock);
> > -     return macvtap_get_user(q, m->msg_iov, total_len,
> > +     return macvtap_get_user(q, m, m->msg_iov, total_len,
> m->msg_iovlen,
> >                           m->msg_flags & MSG_DONTWAIT);
> >  }
> >  
> > 
> 
> 

Michael S. Tsirkin May 4, 2011, 4:12 p.m. UTC | #3
On Wed, May 04, 2011 at 08:37:29AM -0700, Shirley Ma wrote:
> On Wed, 2011-05-04 at 17:58 +0300, Michael S. Tsirkin wrote:
> > On Wed, May 04, 2011 at 01:14:53AM -0700, Shirley Ma wrote:
> > > Only when buffer size is greater than GOODCOPY_LEN (256), macvtap
> > > enables zero-copy.
> > > 
> > > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> > 
> > 
> > Looks good. Some thoughts below.
> > 
> > > ---
> > > 
> > >  drivers/net/macvtap.c |  126
> > ++++++++++++++++++++++++++++++++++++++++++++----
> > >  1 files changed, 115 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> > > index 6696e56..e8bc5ff 100644
> > > --- a/drivers/net/macvtap.c
> > > +++ b/drivers/net/macvtap.c
> > > @@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
> > >   */
> > >  static dev_t macvtap_major;
> > >  #define MACVTAP_NUM_DEVS 65536
> > > +#define GOODCOPY_LEN 256
> > 
> > Scope with MACVTAP_ please.
> Ok.
> 
> > For small packets, is it better to copy in vhost
> > and skip all the back and forth with callbacks? If yes, does
> > it make sense to put the constant above in some header
> > shared with vhost-net?
> 
> skb is created in macvtap, the small packet copy is in skb, so I don't
> think we can do it in vhost here.

One simple way is to pass NULL instead of the pend pointer when we want
macvtap to copy unconditionally. vhost-net will know that the packet was
copied and can notify the user unconditionally.

I think this would solve the small-packet regression you see ... no?
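
(One way the macvtap side could look if vhost-net makes the copy decision
by passing or withholding msg_control; this is a sketch based on the hunk
above, not the posted code:)

	/* zero-copy only when the caller actually handed us completion
	 * info via msg_control; a NULL msg_control forces the copy path */
	zerocopy = m && m->msg_control &&
		   sock_flag(&q->sk, SOCK_ZEROCOPY) && (len > GOODCOPY_LEN);

	if (zerocopy) {
		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
		memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
		       sizeof(struct skb_ubuf_info));
	} else {
		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
						   len);
	}

On the vhost-net side this would just mean setting msg_control to NULL
(instead of the pending-buffer info) for packets below the threshold, and
signalling those buffers used immediately.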

> > >  static struct class *macvtap_class;
> > >  static struct cdev macvtap_cdev;
> > >  
> > > @@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode,
> > struct file *file)
> > >  {
> > >       struct net *net = current->nsproxy->net_ns;
> > >       struct net_device *dev = dev_get_by_index(net, iminor(inode));
> > > +     struct macvlan_dev *vlan = netdev_priv(dev);
> > >       struct macvtap_queue *q;
> > >       int err;
> > >  
> > > @@ -369,6 +371,16 @@ static int macvtap_open(struct inode *inode,
> > struct file *file)
> > >       q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
> > >       q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
> > >  
> > > +     /*
> > > +      * so far only VM uses macvtap, enable zero copy between guest
> > > +      * kernel and host kernel when lower device supports high
> > memory
> > > +      * DMA
> > > +      */
> > > +     if (vlan) {
> > > +             if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
> > > +                     sock_set_flag(&q->sk, SOCK_ZEROCOPY);
> > > +     }
> > > +
> > >       err = macvtap_set_queue(dev, file, q);
> > >       if (err)
> > >               sock_put(&q->sk);
> > > @@ -433,6 +445,80 @@ static inline struct sk_buff
> > *macvtap_alloc_skb(struct sock *sk, size_t prepad,
> > >       return skb;
> > >  }
> > >  
> > > +/* set skb frags from iovec, this can move to core network code for
> > reuse */
> > > +static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct
> > iovec *from,
> > > +                               int offset, size_t count)
> > > +{
> > > +     int len = iov_length(from, count) - offset;
> > > +     int copy = skb_headlen(skb);
> > > +     int size, offset1 = 0;
> > > +     int i = 0;
> > > +     skb_frag_t *f;
> > > +
> > > +     /* Skip over from offset */
> > > +     while (offset >= from->iov_len) {
> > > +             offset -= from->iov_len;
> > > +             ++from;
> > > +             --count;
> > > +     }
> > > +
> > > +     /* copy up to skb headlen */
> > > +     while (copy > 0) {
> > > +             size = min_t(unsigned int, copy, from->iov_len -
> > offset);
> > > +             if (copy_from_user(skb->data + offset1, from->iov_base
> > + offset,
> > > +                                size))
> > > +                     return -EFAULT;
> > > +             if (copy > size) {
> > > +                     ++from;
> > > +                     --count;
> > > +             }
> > > +             copy -= size;
> > > +             offset1 += size;
> > > +             offset = 0;
> > > +     }
> > > +
> > > +     if (len == offset1)
> > > +             return 0;
> > > +
> > > +     while (count--) {
> > > +             struct page *page[MAX_SKB_FRAGS];
> > > +             int num_pages;
> > > +             unsigned long base;
> > > +
> > > +             len = from->iov_len - offset1;
> > > +             if (!len) {
> > > +                     offset1 = 0;
> > > +                     ++from;
> > > +                     continue;
> > > +             }
> > > +             base = (unsigned long)from->iov_base + offset1;
> > > +             size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >>
> > PAGE_SHIFT;
> > > +             num_pages = get_user_pages_fast(base, size, 0,
> > &page[i]);
> > > +             if ((num_pages != size) ||
> > > +                 (num_pages > MAX_SKB_FRAGS -
> > skb_shinfo(skb)->nr_frags))
> > > +                     /* put_page is in skb free */
> > > +                     return -EFAULT;
> > > +             skb->data_len += len;
> > > +             skb->len += len;
> > > +             skb->truesize += len;
> > > +             while (len) {
> > > +                     f = &skb_shinfo(skb)->frags[i];
> > > +                     f->page = page[i];
> > > +                     f->page_offset = base & ~PAGE_MASK;
> > > +                     f->size = min_t(int, len, PAGE_SIZE -
> > f->page_offset);
> > > +                     skb_shinfo(skb)->nr_frags++;
> > > +                     /* increase sk_wmem_alloc */
> > > +                     atomic_add(f->size, &skb->sk->sk_wmem_alloc);
> > 
> > One thing that gave me pause that we only accound for part of the page
> > here. I think we should count the whole page, no?
> 
> The whole page is pinned, but it might not be for this sock only. 

Right, but worst-case it is, so I think we should stay on the safe side
and limit what the user can do.

> I think I should change to atomic_add to outside the loop to save cost.

Sure, good idea.

> > > +                     base += f->size;
> > > +                     len -= f->size;
> > > +                     i++;
> > > +             }
> > > +             offset1 = 0;
> > > +             ++from;
> > > +     }
> > > +     return 0;
> > > +}
> > > +
> > >  /*
> > >   * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
> > >   * be shared with the tun/tap driver.
> > > @@ -515,17 +601,19 @@ static int macvtap_skb_to_vnet_hdr(const
> > struct sk_buff *skb,
> > >  
> > >  
> > >  /* Get packet from user space buffer */
> > > -static ssize_t macvtap_get_user(struct macvtap_queue *q,
> > > -                             const struct iovec *iv, size_t count,
> > > -                             int noblock)
> > > +static ssize_t macvtap_get_user(struct macvtap_queue *q, struct
> > msghdr *m,
> > > +                             const struct iovec *iv, unsigned long
> > total_len,
> > > +                             size_t count, int noblock)
> > >  {
> > >       struct sk_buff *skb;
> > >       struct macvlan_dev *vlan;
> > > -     size_t len = count;
> > > +     unsigned long len = total_len;
> > >       int err;
> > >       struct virtio_net_hdr vnet_hdr = { 0 };
> > >       int vnet_hdr_len = 0;
> > > +     int copylen, zerocopy;
> > >  
> > > +     zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len >
> > GOODCOPY_LEN);
> > >       if (q->flags & IFF_VNET_HDR) {
> > >               vnet_hdr_len = q->vnet_hdr_sz;
> > >  
> > > @@ -552,12 +640,28 @@ static ssize_t macvtap_get_user(struct
> > macvtap_queue *q,
> > >       if (unlikely(len < ETH_HLEN))
> > >               goto err;
> > >  
> > > -     skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len,
> > vnet_hdr.hdr_len,
> > > -                             noblock, &err);
> > > +     if (zerocopy)
> > > +             /* There are 256 bytes to be copied in skb, so there
> > is enough
> > > +              * room for skb expand head in case it is used.
> > > +              * The rest buffer is mapped from userspace.
> > > +              */
> > > +             copylen = GOODCOPY_LEN;
> > 
> > Just curious: where does the number 256 come from?
> > Also, as long as we are copying, should we care about
> > alignment?
> 
> 256 makes the size big enough for any skb head expanding.
> 
> That's my concern before.

I'm not sure I understand. Could you tell me how we know 256 is big enough
for any skb head expansion, please?

> But guest should alignment the buffer already,
> after moving the pointer 256 bytes. It should still alignment, right?

I mean in the host. But whatever, it's not that important at this point.

> > > +     else
> > > +             copylen = len;
> > > +
> > > +     skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
> > > +                             vnet_hdr.hdr_len, noblock, &err);
> > >       if (!skb)
> > >               goto err;
> > >  
> > > -     err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
> > len);
> > > +     if (zerocopy)
> > > +             err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len,
> > count);
> > > +     else
> > > +             err = skb_copy_datagram_from_iovec(skb, 0, iv,
> > vnet_hdr_len,
> > > +                                                len);
> > > +     if (sock_flag(&q->sk, SOCK_ZEROCOPY))
> > > +             memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
> > > +                     sizeof(struct skb_ubuf_info));
> > >       if (err)
> > >               goto err_kfree;
> > >  
> > > @@ -579,7 +683,7 @@ static ssize_t macvtap_get_user(struct
> > macvtap_queue *q,
> > >               kfree_skb(skb);
> > >       rcu_read_unlock_bh();
> > >  
> > > -     return count;
> > > +     return total_len;
> > >  
> > >  err_kfree:
> > >       kfree_skb(skb);
> > > @@ -601,8 +705,8 @@ static ssize_t macvtap_aio_write(struct kiocb
> > *iocb, const struct iovec *iv,
> > >       ssize_t result = -ENOLINK;
> > >       struct macvtap_queue *q = file->private_data;
> > >  
> > > -     result = macvtap_get_user(q, iv, iov_length(iv, count),
> > > -                           file->f_flags & O_NONBLOCK);
> > > +     result = macvtap_get_user(q, NULL, iv, iov_length(iv, count),
> > count,
> > > +                               file->f_flags & O_NONBLOCK);
> > >       return result;
> > >  }
> > >  
> > > @@ -815,7 +919,7 @@ static int macvtap_sendmsg(struct kiocb *iocb,
> > struct socket *sock,
> > >                          struct msghdr *m, size_t total_len)
> > >  {
> > >       struct macvtap_queue *q = container_of(sock, struct
> > macvtap_queue, sock);
> > > -     return macvtap_get_user(q, m->msg_iov, total_len,
> > > +     return macvtap_get_user(q, m, m->msg_iov, total_len,
> > m->msg_iovlen,
> > >                           m->msg_flags & MSG_DONTWAIT);
> > >  }
> > >  
> > > 
> > 
> > 
Michael S. Tsirkin May 4, 2011, 4:14 p.m. UTC | #4
On Wed, May 04, 2011 at 08:37:29AM -0700, Shirley Ma wrote:
> On Wed, 2011-05-04 at 17:58 +0300, Michael S. Tsirkin wrote:
> > On Wed, May 04, 2011 at 01:14:53AM -0700, Shirley Ma wrote:
> > > Only when buffer size is greater than GOODCOPY_LEN (256), macvtap
> > > enables zero-copy.
> > > 
> > > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> > 
> > 
> > Looks good. Some thoughts below.
> > 
> > > ---
> > > 
> > >  drivers/net/macvtap.c |  126
> > ++++++++++++++++++++++++++++++++++++++++++++----
> > >  1 files changed, 115 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> > > index 6696e56..e8bc5ff 100644
> > > --- a/drivers/net/macvtap.c
> > > +++ b/drivers/net/macvtap.c
> > > @@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
> > >   */
> > >  static dev_t macvtap_major;
> > >  #define MACVTAP_NUM_DEVS 65536
> > > +#define GOODCOPY_LEN 256
> > 
> > Scope with MACVTAP_ please.
> Ok.
> 
> > For small packets, is it better to copy in vhost
> > and skip all the back and forth with callbacks? If yes, does
> > it make sense to put the constant above in some header
> > shared with vhost-net?
> 
> skb is created in macvtap, the small packet copy is in skb, so I don't
> think we can do it in vhost here.

BTW this is not very important; it might or might not result in some
speedup. Let's focus on getting it working right.
Shirley Ma May 4, 2011, 4:25 p.m. UTC | #5
On Wed, 2011-05-04 at 19:12 +0300, Michael S. Tsirkin wrote:
> On Wed, May 04, 2011 at 08:37:29AM -0700, Shirley Ma wrote:
> > On Wed, 2011-05-04 at 17:58 +0300, Michael S. Tsirkin wrote:
> > > On Wed, May 04, 2011 at 01:14:53AM -0700, Shirley Ma wrote:
> > > > Only when buffer size is greater than GOODCOPY_LEN (256),
> macvtap
> > > > enables zero-copy.
> > > > 
> > > > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> > > 
> > > 
> > > Looks good. Some thoughts below.
> > > 
> > > > ---
> > > > 
> > > >  drivers/net/macvtap.c |  126
> > > ++++++++++++++++++++++++++++++++++++++++++++----
> > > >  1 files changed, 115 insertions(+), 11 deletions(-)
> > > > 
> > > > diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> > > > index 6696e56..e8bc5ff 100644
> > > > --- a/drivers/net/macvtap.c
> > > > +++ b/drivers/net/macvtap.c
> > > > @@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
> > > >   */
> > > >  static dev_t macvtap_major;
> > > >  #define MACVTAP_NUM_DEVS 65536
> > > > +#define GOODCOPY_LEN 256
> > > 
> > > Scope with MACVTAP_ please.
> > Ok.
> > 
> > > For small packets, is it better to copy in vhost
> > > and skip all the back and forth with callbacks? If yes, does
> > > it make sense to put the constant above in some header
> > > shared with vhost-net?
> > 
> > skb is created in macvtap, the small packet copy is in skb, so I
> don't
> > think we can do it in vhost here.
> 
> One simple way is to pass NULL instead of the pend pointer
> for when we want macvtap to copy unconditionally.
> vhost-net will know that packet is copied and can notify user
> unconditionally.
> 
> I think this will solve the small packet regression you see ... no?

I can certainly test it out. The small-packet regression seems to come
from more guest exits. This patch doesn't do anything to cause more guest
exits, but it did speed up the sendmsg() path.

> > > >  static struct class *macvtap_class;
> > > >  static struct cdev macvtap_cdev;
> > > >  
> > > > @@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode,
> > > struct file *file)
> > > >  {
> > > >       struct net *net = current->nsproxy->net_ns;
> > > >       struct net_device *dev = dev_get_by_index(net,
> iminor(inode));
> > > > +     struct macvlan_dev *vlan = netdev_priv(dev);
> > > >       struct macvtap_queue *q;
> > > >       int err;
> > > >  
> > > > @@ -369,6 +371,16 @@ static int macvtap_open(struct inode
> *inode,
> > > struct file *file)
> > > >       q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
> > > >       q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
> > > >  
> > > > +     /*
> > > > +      * so far only VM uses macvtap, enable zero copy between
> guest
> > > > +      * kernel and host kernel when lower device supports high
> > > memory
> > > > +      * DMA
> > > > +      */
> > > > +     if (vlan) {
> > > > +             if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
> > > > +                     sock_set_flag(&q->sk, SOCK_ZEROCOPY);
> > > > +     }
> > > > +
> > > >       err = macvtap_set_queue(dev, file, q);
> > > >       if (err)
> > > >               sock_put(&q->sk);
> > > > @@ -433,6 +445,80 @@ static inline struct sk_buff
> > > *macvtap_alloc_skb(struct sock *sk, size_t prepad,
> > > >       return skb;
> > > >  }
> > > >  
> > > > +/* set skb frags from iovec, this can move to core network code
> for
> > > reuse */
> > > > +static int zerocopy_sg_from_iovec(struct sk_buff *skb, const
> struct
> > > iovec *from,
> > > > +                               int offset, size_t count)
> > > > +{
> > > > +     int len = iov_length(from, count) - offset;
> > > > +     int copy = skb_headlen(skb);
> > > > +     int size, offset1 = 0;
> > > > +     int i = 0;
> > > > +     skb_frag_t *f;
> > > > +
> > > > +     /* Skip over from offset */
> > > > +     while (offset >= from->iov_len) {
> > > > +             offset -= from->iov_len;
> > > > +             ++from;
> > > > +             --count;
> > > > +     }
> > > > +
> > > > +     /* copy up to skb headlen */
> > > > +     while (copy > 0) {
> > > > +             size = min_t(unsigned int, copy, from->iov_len -
> > > offset);
> > > > +             if (copy_from_user(skb->data + offset1,
> from->iov_base
> > > + offset,
> > > > +                                size))
> > > > +                     return -EFAULT;
> > > > +             if (copy > size) {
> > > > +                     ++from;
> > > > +                     --count;
> > > > +             }
> > > > +             copy -= size;
> > > > +             offset1 += size;
> > > > +             offset = 0;
> > > > +     }
> > > > +
> > > > +     if (len == offset1)
> > > > +             return 0;
> > > > +
> > > > +     while (count--) {
> > > > +             struct page *page[MAX_SKB_FRAGS];
> > > > +             int num_pages;
> > > > +             unsigned long base;
> > > > +
> > > > +             len = from->iov_len - offset1;
> > > > +             if (!len) {
> > > > +                     offset1 = 0;
> > > > +                     ++from;
> > > > +                     continue;
> > > > +             }
> > > > +             base = (unsigned long)from->iov_base + offset1;
> > > > +             size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >>
> > > PAGE_SHIFT;
> > > > +             num_pages = get_user_pages_fast(base, size, 0,
> > > &page[i]);
> > > > +             if ((num_pages != size) ||
> > > > +                 (num_pages > MAX_SKB_FRAGS -
> > > skb_shinfo(skb)->nr_frags))
> > > > +                     /* put_page is in skb free */
> > > > +                     return -EFAULT;
> > > > +             skb->data_len += len;
> > > > +             skb->len += len;
> > > > +             skb->truesize += len;
> > > > +             while (len) {
> > > > +                     f = &skb_shinfo(skb)->frags[i];
> > > > +                     f->page = page[i];
> > > > +                     f->page_offset = base & ~PAGE_MASK;
> > > > +                     f->size = min_t(int, len, PAGE_SIZE -
> > > f->page_offset);
> > > > +                     skb_shinfo(skb)->nr_frags++;
> > > > +                     /* increase sk_wmem_alloc */
> > > > +                     atomic_add(f->size,
> &skb->sk->sk_wmem_alloc);
> > > 
> > > One thing that gave me pause that we only accound for part of the
> page
> > > here. I think we should count the whole page, no?
> > 
> > The whole page is pinned, but it might not be for this sock only. 
> 
> Right, but worst-case it is, so I think we should stay on the safe
> side
> and limit what the user can do.
Ok, let me try this out.

> > I think I should change to atomic_add to outside the loop to save
> cost.
> 
> Sure, good idea.
> 
> > > > +                     base += f->size;
> > > > +                     len -= f->size;
> > > > +                     i++;
> > > > +             }
> > > > +             offset1 = 0;
> > > > +             ++from;
> > > > +     }
> > > > +     return 0;
> > > > +}
> > > > +
> > > >  /*
> > > >   * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
> > > >   * be shared with the tun/tap driver.
> > > > @@ -515,17 +601,19 @@ static int macvtap_skb_to_vnet_hdr(const
> > > struct sk_buff *skb,
> > > >  
> > > >  
> > > >  /* Get packet from user space buffer */
> > > > -static ssize_t macvtap_get_user(struct macvtap_queue *q,
> > > > -                             const struct iovec *iv, size_t
> count,
> > > > -                             int noblock)
> > > > +static ssize_t macvtap_get_user(struct macvtap_queue *q, struct
> > > msghdr *m,
> > > > +                             const struct iovec *iv, unsigned
> long
> > > total_len,
> > > > +                             size_t count, int noblock)
> > > >  {
> > > >       struct sk_buff *skb;
> > > >       struct macvlan_dev *vlan;
> > > > -     size_t len = count;
> > > > +     unsigned long len = total_len;
> > > >       int err;
> > > >       struct virtio_net_hdr vnet_hdr = { 0 };
> > > >       int vnet_hdr_len = 0;
> > > > +     int copylen, zerocopy;
> > > >  
> > > > +     zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len >
> > > GOODCOPY_LEN);
> > > >       if (q->flags & IFF_VNET_HDR) {
> > > >               vnet_hdr_len = q->vnet_hdr_sz;
> > > >  
> > > > @@ -552,12 +640,28 @@ static ssize_t macvtap_get_user(struct
> > > macvtap_queue *q,
> > > >       if (unlikely(len < ETH_HLEN))
> > > >               goto err;
> > > >  
> > > > -     skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len,
> > > vnet_hdr.hdr_len,
> > > > -                             noblock, &err);
> > > > +     if (zerocopy)
> > > > +             /* There are 256 bytes to be copied in skb, so
> there
> > > is enough
> > > > +              * room for skb expand head in case it is used.
> > > > +              * The rest buffer is mapped from userspace.
> > > > +              */
> > > > +             copylen = GOODCOPY_LEN;
> > > 
> > > Just curious: where does the number 256 come from?
> > > Also, as long as we are copying, should we care about
> > > alignment?
> > 
> > 256 makes the size big enough for any skb head expanding.
> > 
> > That's my concern before.
> 
> I'm not sure I understand.
> Could you tell me how do we know 256 is big enough for any skb head
> expanding please?
I meant to make sure that in any case all the headers end up in the skb
linear area, not in the frags.
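
(One possible way to guarantee that, sizing the copied head from the
advertised header length rather than from a fixed constant; a sketch, not
the posted patch:)

	if (zerocopy) {
		/* make sure the whole protocol header lands in the linear
		 * area, whatever its size; fall back to the fixed value
		 * when the guest did not advertise a header length */
		copylen = vnet_hdr.hdr_len;
		if (!copylen)
			copylen = GOODCOPY_LEN;
	} else
		copylen = len;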

> > But guest should alignment the buffer already,
> > after moving the pointer 256 bytes. It should still alignment,
> right?
> 
> I mean in the host. But whatever, it's not that important at this
> point.
> 
> > > > +     else
> > > > +             copylen = len;
> > > > +
> > > > +     skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
> > > > +                             vnet_hdr.hdr_len, noblock, &err);
> > > >       if (!skb)
> > > >               goto err;
> > > >  
> > > > -     err = skb_copy_datagram_from_iovec(skb, 0, iv,
> vnet_hdr_len,
> > > len);
> > > > +     if (zerocopy)
> > > > +             err = zerocopy_sg_from_iovec(skb, iv,
> vnet_hdr_len,
> > > count);
> > > > +     else
> > > > +             err = skb_copy_datagram_from_iovec(skb, 0, iv,
> > > vnet_hdr_len,
> > > > +                                                len);
> > > > +     if (sock_flag(&q->sk, SOCK_ZEROCOPY))
> > > > +             memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
> > > > +                     sizeof(struct skb_ubuf_info));
> > > >       if (err)
> > > >               goto err_kfree;
> > > >  
> > > > @@ -579,7 +683,7 @@ static ssize_t macvtap_get_user(struct
> > > macvtap_queue *q,
> > > >               kfree_skb(skb);
> > > >       rcu_read_unlock_bh();
> > > >  
> > > > -     return count;
> > > > +     return total_len;
> > > >  
> > > >  err_kfree:
> > > >       kfree_skb(skb);
> > > > @@ -601,8 +705,8 @@ static ssize_t macvtap_aio_write(struct
> kiocb
> > > *iocb, const struct iovec *iv,
> > > >       ssize_t result = -ENOLINK;
> > > >       struct macvtap_queue *q = file->private_data;
> > > >  
> > > > -     result = macvtap_get_user(q, iv, iov_length(iv, count),
> > > > -                           file->f_flags & O_NONBLOCK);
> > > > +     result = macvtap_get_user(q, NULL, iv, iov_length(iv,
> count),
> > > count,
> > > > +                               file->f_flags & O_NONBLOCK);
> > > >       return result;
> > > >  }
> > > >  
> > > > @@ -815,7 +919,7 @@ static int macvtap_sendmsg(struct kiocb
> *iocb,
> > > struct socket *sock,
> > > >                          struct msghdr *m, size_t total_len)
> > > >  {
> > > >       struct macvtap_queue *q = container_of(sock, struct
> > > macvtap_queue, sock);
> > > > -     return macvtap_get_user(q, m->msg_iov, total_len,
> > > > +     return macvtap_get_user(q, m, m->msg_iov, total_len,
> > > m->msg_iovlen,
> > > >                           m->msg_flags & MSG_DONTWAIT);
> > > >  }
> > > >  
> > > > 
> > > 
> > > 

Shirley Ma May 5, 2011, 8:59 p.m. UTC | #6
On Wed, 2011-05-04 at 16:13 -0700, Sridhar Samudrala wrote:
> On Wed, 2011-05-04 at 01:14 -0700, Shirley Ma wrote: 
> > Only when buffer size is greater than GOODCOPY_LEN (256), macvtap
> > enables zero-copy.
> > 
> > Signed-off-by: Shirley Ma <xma@us.ibm.com>
> > ---
> > 
> >  drivers/net/macvtap.c |  126 ++++++++++++++++++++++++++++++++++++++++++++----
> >  1 files changed, 115 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
> > index 6696e56..e8bc5ff 100644
> > --- a/drivers/net/macvtap.c
> > +++ b/drivers/net/macvtap.c
> > @@ -60,6 +60,7 @@ static struct proto macvtap_proto = {
> >   */
> >  static dev_t macvtap_major;
> >  #define MACVTAP_NUM_DEVS 65536
> > +#define GOODCOPY_LEN 256
> >  static struct class *macvtap_class;
> >  static struct cdev macvtap_cdev;
> > 
> > @@ -340,6 +341,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
> >  {
> >  	struct net *net = current->nsproxy->net_ns;
> >  	struct net_device *dev = dev_get_by_index(net, iminor(inode));
> > +	struct macvlan_dev *vlan = netdev_priv(dev);
> >  	struct macvtap_queue *q;
> >  	int err;
> > 
> > @@ -369,6 +371,16 @@ static int macvtap_open(struct inode *inode, struct file *file)
> >  	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
> >  	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
> > 
> > +	/*
> > +	 * so far only VM uses macvtap, enable zero copy between guest
> > +	 * kernel and host kernel when lower device supports high memory
> > +	 * DMA
> > +	 */
> Instead of VM uses macvtap, it makes it more clear to say  'KVM virtio
> uses macvtap for guest networking'.
> Also you are checking if lower device allows zerocopy, not high dma.

Sure.

>  
> > +	if (vlan) {
> > +		if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
> > +			sock_set_flag(&q->sk, SOCK_ZEROCOPY);
> > +	}
> > +
> 
> Do we need to check for vlan? If required, both if conditions can be
> combined. 

Yes, I will run more tests before enabling it.
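
(Sketch of the rewording and of folding the two checks into one, as
suggested above:)

	/*
	 * KVM virtio uses macvtap for guest networking: enable zero copy
	 * between guest and host kernel when the lower device supports
	 * zero-copy.
	 */
	if (vlan && (vlan->lowerdev->features & NETIF_F_ZEROCOPY))
		sock_set_flag(&q->sk, SOCK_ZEROCOPY);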

> > 	err = macvtap_set_queue(dev, file, q);
> >  	if (err)
> >  		sock_put(&q->sk);
> > @@ -433,6 +445,80 @@ static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
> >  	return skb;
> >  }
> > 
> > +/* set skb frags from iovec, this can move to core network code for reuse */
> > +static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
> > +				  int offset, size_t count)
> > +{
> > +	int len = iov_length(from, count) - offset;
> > +	int copy = skb_headlen(skb);
> > +	int size, offset1 = 0;
> > +	int i = 0;
> > +	skb_frag_t *f;
> > +
> > +	/* Skip over from offset */
> > +	while (offset >= from->iov_len) {
> > +		offset -= from->iov_len;
> > +		++from;
> > +		--count;
> > +	}
> > +
> > +	/* copy up to skb headlen */
> > +	while (copy > 0) {
> > +		size = min_t(unsigned int, copy, from->iov_len - offset);
> > +		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
> > +				   size))
> > +			return -EFAULT;
> > +		if (copy > size) {
> > +			++from;
> > +			--count;
> > +		}
> > +		copy -= size;
> > +		offset1 += size;
> > +		offset = 0;
> > +	}
> 
> I think you need to check for count reaching zero in the above 2 while
> loops.

It doesn't hurt to check count here; in this case the caller already
guarantees copylen/len/offset are less than count/len.
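
(If the guard is added anyway, it would only touch the two loop conditions;
sketch:)

	/* Skip over from offset */
	while (count && offset >= from->iov_len) {
		offset -= from->iov_len;
		++from;
		--count;
	}

	/* copy up to skb headlen */
	while (count && copy > 0) {
		size = min_t(unsigned int, copy, from->iov_len - offset);
		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
				   size))
			return -EFAULT;
		if (copy > size) {
			++from;
			--count;
		}
		copy -= size;
		offset1 += size;
		offset = 0;
	}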

> > +
> > +	if (len == offset1)
> > +		return 0;
> > +
> > +	while (count--) {
> > +		struct page *page[MAX_SKB_FRAGS];
> > +		int num_pages;
> > +		unsigned long base;
> > +
> > +		len = from->iov_len - offset1;
> > +		if (!len) {
> > +			offset1 = 0;
> > +			++from;
> > +			continue;
> > +		}
> > +		base = (unsigned long)from->iov_base + offset1;
> > +		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
> > +		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
> > +		if ((num_pages != size) ||
> > +		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
> > +			/* put_page is in skb free */
> > +			return -EFAULT;
> > +		skb->data_len += len;
> > +		skb->len += len;
> > +		skb->truesize += len;
> > +		while (len) {
> > +			f = &skb_shinfo(skb)->frags[i];
> > +			f->page = page[i];
> > +			f->page_offset = base & ~PAGE_MASK;
> > +			f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
> > +			skb_shinfo(skb)->nr_frags++;
> > +			/* increase sk_wmem_alloc */
> > +			atomic_add(f->size, &skb->sk->sk_wmem_alloc);
> > +			base += f->size;
> > +			len -= f->size;
> > +			i++;
> > +		}
> > +		offset1 = 0;
> > +		++from;
> > +	}
> > +	return 0;
> > +}
> > +
> >  /*
> >   * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
> >   * be shared with the tun/tap driver.
> > @@ -515,17 +601,19 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
> > 
> > 
> >  /* Get packet from user space buffer */
> > -static ssize_t macvtap_get_user(struct macvtap_queue *q,
> > -				const struct iovec *iv, size_t count,
> > -				int noblock)
> > +static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
> > +				const struct iovec *iv, unsigned long total_len,
> > +				size_t count, int noblock)
> >  {
> >  	struct sk_buff *skb;
> >  	struct macvlan_dev *vlan;
> > -	size_t len = count;
> > +	unsigned long len = total_len;
> >  	int err;
> >  	struct virtio_net_hdr vnet_hdr = { 0 };
> >  	int vnet_hdr_len = 0;
> > +	int copylen, zerocopy;
> zerocopy can be boolean 

Yes, I have changed it in V5.

> > +	zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len > GOODCOPY_LEN);
> >  	if (q->flags & IFF_VNET_HDR) {
> >  		vnet_hdr_len = q->vnet_hdr_sz;
> > 
> > @@ -552,12 +640,28 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
> >  	if (unlikely(len < ETH_HLEN))
> >  		goto err;
> > 
> > -	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
> > -				noblock, &err);
> > +	if (zerocopy)
> > +		/* There are 256 bytes to be copied in skb, so there is enough
> > +		 * room for skb expand head in case it is used.
> > +		 * The rest buffer is mapped from userspace.
> > +		 */
> > +		copylen = GOODCOPY_LEN;
> > +	else
> > +		copylen = len;
> > +
> > +	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
> > +				vnet_hdr.hdr_len, noblock, &err);
> >  	if (!skb)
> >  		goto err;
> > 
> > -	err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len);
> > +	if (zerocopy)
> > +		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
> > +	else
> > +		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
> > +						   len);
> > +	if (sock_flag(&q->sk, SOCK_ZEROCOPY))
> > +		memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
> > +			sizeof(struct skb_ubuf_info));
> Why are you doing a separate check for this memcpy? Is this required
> if the len < 256? 

Yes. But I have changed it in V5, based on MST's suggestion to leave the
tracking of copied buffers to vhost.

> > 	if (err)
> >  		goto err_kfree;
> > 
> > @@ -579,7 +683,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
> >  		kfree_skb(skb);
> >  	rcu_read_unlock_bh();
> > 
> > -	return count;
> > +	return total_len;
> > 
> >  err_kfree:
> >  	kfree_skb(skb);
> > @@ -601,8 +705,8 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
> >  	ssize_t result = -ENOLINK;
> >  	struct macvtap_queue *q = file->private_data;
> > 
> > -	result = macvtap_get_user(q, iv, iov_length(iv, count),
> > -			      file->f_flags & O_NONBLOCK);
> > +	result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
> > +				  file->f_flags & O_NONBLOCK);
> >  	return result;
> >  }
> > 
> > @@ -815,7 +919,7 @@ static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
> >  			   struct msghdr *m, size_t total_len)
> >  {
> >  	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
> > -	return macvtap_get_user(q, m->msg_iov, total_len,
> > +	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
> >  			    m->msg_flags & MSG_DONTWAIT);
> >  }
> > 
> > 
> > 
> 


Patch

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6696e56..e8bc5ff 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -60,6 +60,7 @@  static struct proto macvtap_proto = {
  */
 static dev_t macvtap_major;
 #define MACVTAP_NUM_DEVS 65536
+#define GOODCOPY_LEN 256
 static struct class *macvtap_class;
 static struct cdev macvtap_cdev;
 
@@ -340,6 +341,7 @@  static int macvtap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
 	struct net_device *dev = dev_get_by_index(net, iminor(inode));
+	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvtap_queue *q;
 	int err;
 
@@ -369,6 +371,16 @@  static int macvtap_open(struct inode *inode, struct file *file)
 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
 	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
 
+	/*
+	 * so far only VM uses macvtap, enable zero copy between guest
+	 * kernel and host kernel when lower device supports high memory
+	 * DMA
+	 */
+	if (vlan) {
+		if (vlan->lowerdev->features & NETIF_F_ZEROCOPY)
+			sock_set_flag(&q->sk, SOCK_ZEROCOPY);
+	}
+
 	err = macvtap_set_queue(dev, file, q);
 	if (err)
 		sock_put(&q->sk);
@@ -433,6 +445,80 @@  static inline struct sk_buff *macvtap_alloc_skb(struct sock *sk, size_t prepad,
 	return skb;
 }
 
+/* set skb frags from iovec, this can move to core network code for reuse */
+static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
+				  int offset, size_t count)
+{
+	int len = iov_length(from, count) - offset;
+	int copy = skb_headlen(skb);
+	int size, offset1 = 0;
+	int i = 0;
+	skb_frag_t *f;
+
+	/* Skip over from offset */
+	while (offset >= from->iov_len) {
+		offset -= from->iov_len;
+		++from;
+		--count;
+	}
+
+	/* copy up to skb headlen */
+	while (copy > 0) {
+		size = min_t(unsigned int, copy, from->iov_len - offset);
+		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
+				   size))
+			return -EFAULT;
+		if (copy > size) {
+			++from;
+			--count;
+		}
+		copy -= size;
+		offset1 += size;
+		offset = 0;
+	}
+
+	if (len == offset1)
+		return 0;
+
+	while (count--) {
+		struct page *page[MAX_SKB_FRAGS];
+		int num_pages;
+		unsigned long base;
+
+		len = from->iov_len - offset1;
+		if (!len) {
+			offset1 = 0;
+			++from;
+			continue;
+		}
+		base = (unsigned long)from->iov_base + offset1;
+		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
+		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
+		if ((num_pages != size) ||
+		    (num_pages > MAX_SKB_FRAGS - skb_shinfo(skb)->nr_frags))
+			/* put_page is in skb free */
+			return -EFAULT;
+		skb->data_len += len;
+		skb->len += len;
+		skb->truesize += len;
+		while (len) {
+			f = &skb_shinfo(skb)->frags[i];
+			f->page = page[i];
+			f->page_offset = base & ~PAGE_MASK;
+			f->size = min_t(int, len, PAGE_SIZE - f->page_offset);
+			skb_shinfo(skb)->nr_frags++;
+			/* increase sk_wmem_alloc */
+			atomic_add(f->size, &skb->sk->sk_wmem_alloc);
+			base += f->size;
+			len -= f->size;
+			i++;
+		}
+		offset1 = 0;
+		++from;
+	}
+	return 0;
+}
+
 /*
  * macvtap_skb_from_vnet_hdr and macvtap_skb_to_vnet_hdr should
  * be shared with the tun/tap driver.
@@ -515,17 +601,19 @@  static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb,
 
 
 /* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q,
-				const struct iovec *iv, size_t count,
-				int noblock)
+static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m,
+				const struct iovec *iv, unsigned long total_len,
+				size_t count, int noblock)
 {
 	struct sk_buff *skb;
 	struct macvlan_dev *vlan;
-	size_t len = count;
+	unsigned long len = total_len;
 	int err;
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int vnet_hdr_len = 0;
+	int copylen, zerocopy;
 
+	zerocopy = sock_flag(&q->sk, SOCK_ZEROCOPY) && (len > GOODCOPY_LEN);
 	if (q->flags & IFF_VNET_HDR) {
 		vnet_hdr_len = q->vnet_hdr_sz;
 
@@ -552,12 +640,28 @@  static ssize_t macvtap_get_user(struct macvtap_queue *q,
 	if (unlikely(len < ETH_HLEN))
 		goto err;
 
-	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, len, vnet_hdr.hdr_len,
-				noblock, &err);
+	if (zerocopy)
+		/* There are 256 bytes to be copied in skb, so there is enough
+		 * room for skb expand head in case it is used.
+		 * The rest buffer is mapped from userspace.
+		 */
+		copylen = GOODCOPY_LEN;
+	else
+		copylen = len;
+
+	skb = macvtap_alloc_skb(&q->sk, NET_IP_ALIGN, copylen,
+				vnet_hdr.hdr_len, noblock, &err);
 	if (!skb)
 		goto err;
 
-	err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len);
+	if (zerocopy)
+		err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count);
+	else
+		err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len,
+						   len);
+	if (sock_flag(&q->sk, SOCK_ZEROCOPY))
+		memcpy(&skb_shinfo(skb)->ubuf, m->msg_control,
+			sizeof(struct skb_ubuf_info));
 	if (err)
 		goto err_kfree;
 
@@ -579,7 +683,7 @@  static ssize_t macvtap_get_user(struct macvtap_queue *q,
 		kfree_skb(skb);
 	rcu_read_unlock_bh();
 
-	return count;
+	return total_len;
 
 err_kfree:
 	kfree_skb(skb);
@@ -601,8 +705,8 @@  static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 	ssize_t result = -ENOLINK;
 	struct macvtap_queue *q = file->private_data;
 
-	result = macvtap_get_user(q, iv, iov_length(iv, count),
-			      file->f_flags & O_NONBLOCK);
+	result = macvtap_get_user(q, NULL, iv, iov_length(iv, count), count,
+				  file->f_flags & O_NONBLOCK);
 	return result;
 }
 
@@ -815,7 +919,7 @@  static int macvtap_sendmsg(struct kiocb *iocb, struct socket *sock,
 			   struct msghdr *m, size_t total_len)
 {
 	struct macvtap_queue *q = container_of(sock, struct macvtap_queue, sock);
-	return macvtap_get_user(q, m->msg_iov, total_len,
+	return macvtap_get_user(q, m, m->msg_iov, total_len, m->msg_iovlen,
 			    m->msg_flags & MSG_DONTWAIT);
 }