[RFC,1/1] vhost: TX used buffer guest signal accumulation

Message ID 1288240804.14342.1.camel@localhost.localdomain
State RFC, archived
Delegated to: David Miller

Commit Message

Shirley Ma Oct. 28, 2010, 4:40 a.m. UTC
Resubmitting this patch to fix some minor errors (whitespace, typo).

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 drivers/vhost/net.c   |   20 +++++++++++++++++++-
 drivers/vhost/vhost.c |   32 ++++++++++++++++++++++++++++++++
 drivers/vhost/vhost.h |    3 +++
 3 files changed, 54 insertions(+), 1 deletions(-)




Comments

Michael S. Tsirkin Oct. 28, 2010, 5:20 a.m. UTC | #1
On Wed, Oct 27, 2010 at 09:40:04PM -0700, Shirley Ma wrote:
> Resubmitting this patch to fix some minor errors (whitespace, typo).
> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>

My concern is this can delay signalling for unlimited time.
Could you pls test this with guests that do not have
2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb
b0c39dbdc204006ef3558a66716ff09797619778
that is 2.6.31 and older?

This seems to be slightly out of spec, even though
for TX, signals are less important.

Two ideas:
1. How about writing out used, just delaying the signal?
   This way we don't have to queue separately.
2. How about flushing out queued stuff before we exit
   the handle_tx loop? That would address most of
   the spec issue.
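
A rough sketch of idea 1, assuming the existing vhost_add_used()/vhost_signal()
split in drivers/vhost/vhost.c, the max_pend threshold from the patch, and a
hypothetical local pending counter in handle_tx() (not the patch as posted):

	/* write each used entry out immediately... */
	vhost_add_used(vq, head, 0);
	/* ...but only raise the guest interrupt every max_pend completions */
	if (++pend_cnt >= max_pend) {
		vhost_signal(&net->dev, vq);
		pend_cnt = 0;
	}

A final vhost_signal() after the loop would still be needed so the last
partial batch is not left unannounced.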

> ---
>  drivers/vhost/net.c   |   20 +++++++++++++++++++-
>  drivers/vhost/vhost.c |   32 ++++++++++++++++++++++++++++++++
>  drivers/vhost/vhost.h |    3 +++
>  3 files changed, 54 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 4b4da5b..3eb8016 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -128,6 +128,7 @@ static void handle_tx(struct vhost_net *net)
>  	int err, wmem;
>  	size_t hdr_size;
>  	struct socket *sock;
> +	int max_pend = vq->num - (vq->num >> 2);
>  
>  	sock = rcu_dereference_check(vq->private_data,
>  				     lockdep_is_held(&vq->mutex));
> @@ -198,7 +199,24 @@ static void handle_tx(struct vhost_net *net)
>  		if (err != len)
>  			pr_debug("Truncated TX packet: "
>  				 " len %d != %zd\n", err, len);
> -		vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		/*
> +		 * If the pending array was not allocated, signal used buffers
> +		 * one by one; otherwise, batch them and signal once 3/4 of
> +		 * the ring size is reached, to reduce CPU utilization.
> +		 */
> +		if (unlikely(!vq->pend))
> +			vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +		else {
> +			vq->pend[vq->num_pend].id = head;
> +			vq->pend[vq->num_pend].len = 0;
> +			++vq->num_pend;
> +			if (vq->num_pend == max_pend) {
> +				vhost_add_used_and_signal_n(&net->dev, vq,
> +							    vq->pend,
> +							    vq->num_pend);
> +				vq->num_pend = 0;
> +			}
> +		}
>  		total_len += len;
>  		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  			vhost_poll_queue(&vq->poll);
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 94701ff..f2f3288 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -170,6 +170,16 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->call_ctx = NULL;
>  	vq->call = NULL;
>  	vq->log_ctx = NULL;
> +	/* signal pending used buffers */
> +	if (vq->pend) {
> +		if (vq->num_pend != 0) {
> +			vhost_add_used_and_signal_n(dev, vq, vq->pend,
> +						    vq->num_pend);
> +			vq->num_pend = 0;
> +		}
> +		kfree(vq->pend);
> +	}
> +	vq->pend = NULL;
>  }
>  
>  static int vhost_worker(void *data)
> @@ -273,7 +283,14 @@ long vhost_dev_init(struct vhost_dev *dev,
>  		dev->vqs[i].heads = NULL;
>  		dev->vqs[i].dev = dev;
>  		mutex_init(&dev->vqs[i].mutex);
> +		dev->vqs[i].num_pend = 0;
> +		dev->vqs[i].pend = NULL;
>  		vhost_vq_reset(dev, dev->vqs + i);
> +		/* batch up to 3/4 of the ring size before signaling */
> +		dev->vqs[i].pend = kmalloc((dev->vqs[i].num -
> +					   (dev->vqs[i].num >> 2)) *
> +					   sizeof *dev->vqs[i].pend,
> +					   GFP_KERNEL);
>  		if (dev->vqs[i].handle_kick)
>  			vhost_poll_init(&dev->vqs[i].poll,
>  					dev->vqs[i].handle_kick, POLLIN, dev);
> @@ -599,6 +616,21 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
>  			r = -EINVAL;
>  			break;
>  		}
> +		if (vq->num != s.num) {
> +			/* signal used buffers first */
> +			if (vq->pend) {
> +				if (vq->num_pend != 0) {
> +					vhost_add_used_and_signal_n(vq->dev, vq,
> +								    vq->pend,
> +								    vq->num_pend);
> +					vq->num_pend = 0;
> +				}
> +				kfree(vq->pend);
> +			}
> +			/* reallocate the pending used buffer array */
> +			vq->pend = kmalloc((s.num - (s.num >> 2)) *
> +					   sizeof *vq->pend, GFP_KERNEL);
> +		}
>  		vq->num = s.num;
>  		break;
>  	case VHOST_SET_VRING_BASE:
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 073d06a..78949c0 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -108,6 +108,9 @@ struct vhost_virtqueue {
>  	/* Log write descriptors */
>  	void __user *log_base;
>  	struct vhost_log *log;
> +	/* delay multiple used buffers to signal once */
> +	int num_pend;
> +	struct vring_used_elem *pend;
>  };
>  
>  struct vhost_dev {
> 
Shirley Ma Oct. 28, 2010, 3:24 p.m. UTC | #2
On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote:
> My concern is this can delay signalling for unlimited time.
> Could you pls test this with guests that do not have
> 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb
> b0c39dbdc204006ef3558a66716ff09797619778
> that is 2.6.31 and older?

I will test it out.

> This seems to be slightly out of spec, even though
> for TX, signals are less important.
> Two ideas:
> 1. How about writing out used, just delaying the signal?
>    This way we don't have to queue separately.
> 2. How about flushing out queued stuff before we exit
>    the handle_tx loop? That would address most of
>    the spec issue. 

I will modify the patch to test out the performance for both 1 & 2
approaches.

Thanks
Shirley



Shirley Ma Oct. 28, 2010, 5:14 p.m. UTC | #3
> Two ideas:
> 1. How about writing out used, just delaying the signal?
>    This way we don't have to queue separately.

This improves performance somewhat, but not as much as delaying
both the used updates and the signal. Delaying the used buffers
combines multiple small copies into one large copy, which saves
more CPU utilization and gains some bandwidth.

> 2. How about flushing out queued stuff before we exit
>    the handle_tx loop? That would address most of
>    the spec issue. 

The performance is almost the same as with the previous patch. I will
resubmit the modified one, adding vhost_add_used_and_signal_n after the
handle_tx loop to process the pending queue.

This patch was part of the modified macvtap zero-copy work, which I
haven't submitted yet. I found it helps vhost TX in general. This
pending queue will be used for DMA-done handling later, so I put it in
the vq instead of a local variable in handle_tx.
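
For reference, a minimal sketch of that flush, reusing the pend/num_pend
fields from this patch and placed right after the for loop in handle_tx()
(assumed placement, not the resubmitted patch itself):

	/* flush any batched-but-unsignalled used buffers before returning */
	if (vq->num_pend) {
		vhost_add_used_and_signal_n(&net->dev, vq, vq->pend,
					    vq->num_pend);
		vq->num_pend = 0;
	}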

Thanks
Shirley

Shirley Ma Oct. 28, 2010, 7:32 p.m. UTC | #4
On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote:
> My concern is this can delay signalling for unlimited time.
> Could you pls test this with guests that do not have
> 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb
> b0c39dbdc204006ef3558a66716ff09797619778
> that is 2.6.31 and older? 

The patch only delays signaling for an unlimited time when there is no
TX packet to transmit. I thought TX signaling only notifies the guest
to release the used buffers; is there anything else besides this?

I tested a RHEL5u5 guest (2.6.18 kernel), and it works fine. I checked
the logs of the two commits; I don't think this patch could cause any
issue without these two patches.

Also, I found a big TX regression between the old guest and the new
guest. With the old guest I am able to get almost 11Gb/s for a 2K
message size, but with the new guest kernel I can only get 3.5Gb/s with
the patch and the same host. I will dig into why.

thanks
Shirley

Shirley Ma Oct. 28, 2010, 8:13 p.m. UTC | #5
On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote:
> Also, I found a big TX regression between the old guest and the new
> guest. With the old guest I am able to get almost 11Gb/s for a 2K
> message size, but with the new guest kernel I can only get 3.5Gb/s
> with the patch and the same host. I will dig into why.

The regression is from the guest kernel, not from this patch. I tested
the 2.6.31 kernel, and its performance is already less than 2Gb/s for a
2K message size. I will resubmit the patch for review.

I will start testing from the 2.6.30 kernel to figure out when the TX
regression was introduced in virtio_net. Any suggestion on which guest
kernel I should test to figure out this regression?

Thanks
Shirley

Sridhar Samudrala Oct. 28, 2010, 9:04 p.m. UTC | #6
On Thu, 2010-10-28 at 13:13 -0700, Shirley Ma wrote:
> On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote:
> > Also, I found a big TX regression between the old guest and the new
> > guest. With the old guest I am able to get almost 11Gb/s for a 2K
> > message size, but with the new guest kernel I can only get 3.5Gb/s
> > with the patch and the same host. I will dig into why.
> 
> The regression is from the guest kernel, not from this patch. I tested
> the 2.6.31 kernel, and its performance is already less than 2Gb/s for
> a 2K message size. I will resubmit the patch for review.
> 
> I will start testing from the 2.6.30 kernel to figure out when the TX
> regression was introduced in virtio_net. Any suggestion on which guest
> kernel I should test to figure out this regression?

It would be some change in virtio-net driver that may have improved the
latency of small messages which in turn would have reduced the bandwidth
as TCP could not accumulate and send large packets.

Thanks
Sridhar

Shirley Ma Oct. 28, 2010, 9:40 p.m. UTC | #7
On Thu, 2010-10-28 at 14:04 -0700, Sridhar Samudrala wrote:
> It would be some change in virtio-net driver that may have improved
> the latency of small messages which in turn would have reduced the
> bandwidth as TCP could not accumulate and send large packets.

I will check out any latency improvement patches in virtio_net. If
that's the case, would it be good to have some tunable parameter to
benefit both BW and latency workloads?

Shirley 

Michael S. Tsirkin Oct. 29, 2010, 8:03 a.m. UTC | #8
On Thu, Oct 28, 2010 at 12:32:35PM -0700, Shirley Ma wrote:
> On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote:
> > My concern is this can delay signalling for unlimited time.
> > Could you pls test this with guests that do not have
> > 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb
> > b0c39dbdc204006ef3558a66716ff09797619778
> > that is 2.6.31 and older? 
> 
> The patch only delays signaling for an unlimited time when there is no
> TX packet to transmit. I thought TX signaling only notifies the guest
> to release the used buffers; is there anything else besides this?

Right, that's it I think. For newer kernels we orphan the skb
on xmit so we don't care that much about completing them.
This does rely on an undocumented assumption about guest
behaviour though.
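
For context, a rough sketch of that guest-side behaviour in newer
virtio_net start_xmit() (as introduced around the commits referenced
above; shown here only as an illustration of the assumption):

	/* the skb no longer holds a reference to its socket once handed
	 * to the host, so a delayed TX completion mostly costs ring
	 * space rather than socket send-buffer accounting */
	skb_orphan(skb);
	nf_reset(skb);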

> I tested a RHEL5u5 guest (2.6.18 kernel), and it works fine. I checked
> the logs of the two commits; I don't think this patch could cause any
> issue without these two patches.
> 
> Also, I found a big TX regression between the old guest and the new
> guest. With the old guest I am able to get almost 11Gb/s for a 2K
> message size, but with the new guest kernel I can only get 3.5Gb/s
> with the patch and the same host. I will dig into why.
> 
> thanks
> Shirley
Michael S. Tsirkin Oct. 29, 2010, 8:10 a.m. UTC | #9
On Thu, Oct 28, 2010 at 10:14:22AM -0700, Shirley Ma wrote:
> 
> > Two ideas:
> > 1. How about writing out used, just delaying the signal?
> >    This way we don't have to queue separately.
> 
> This improves performance somewhat, but not as much as delaying
> both the used updates and the signal. Delaying the used buffers
> combines multiple small copies into one large copy, which saves
> more CPU utilization and gains some bandwidth.

Hmm. I don't yet understand. We are still doing copies into the per-vq
buffer, and the data copied is really small.  Is it about cache line
bounces?  Could you try figuring it out?

> > 2. How about flushing out queued stuff before we exit
> >    the handle_tx loop? That would address most of
> >    the spec issue. 
> 
> The performance is almost the same as with the previous patch. I will
> resubmit the modified one, adding vhost_add_used_and_signal_n after
> the handle_tx loop to process the pending queue.
> 
> This patch was part of the modified macvtap zero-copy work, which I
> haven't submitted yet. I found it helps vhost TX in general. This
> pending queue will be used for DMA-done handling later, so I put it in
> the vq instead of a local variable in handle_tx.
> 
> Thanks
> Shirley

BTW why do we need another array? Isn't heads field exactly what we need
here?
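
A hypothetical sketch of what reusing the existing array could look like
in handle_tx(), ignoring the size limit on heads that the follow-up below
points out:

	/* batch into vq->heads instead of a separate pend array */
	vq->heads[vq->num_pend].id = head;
	vq->heads[vq->num_pend].len = 0;
	if (++vq->num_pend == max_pend) {
		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
					    vq->num_pend);
		vq->num_pend = 0;
	}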

Michael S. Tsirkin Oct. 29, 2010, 8:11 a.m. UTC | #10
On Thu, Oct 28, 2010 at 02:40:50PM -0700, Shirley Ma wrote:
> On Thu, 2010-10-28 at 14:04 -0700, Sridhar Samudrala wrote:
> > It would be some change in virtio-net driver that may have improved
> > the latency of small messages which in turn would have reduced the
> > bandwidth as TCP could not accumulate and send large packets.
> 
> I will check out any latency improvement patches in virtio_net. If
> that's the case, would it be good to have some tunable parameter to
> benefit both BW and latency workloads?
> 
> Shirley 

No, we need it to work well automatically somehow.
Michael S. Tsirkin Oct. 29, 2010, 8:12 a.m. UTC | #11
On Thu, Oct 28, 2010 at 01:13:55PM -0700, Shirley Ma wrote:
> On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote:
> > Also, I found a big TX regression between the old guest and the new
> > guest. With the old guest I am able to get almost 11Gb/s for a 2K
> > message size, but with the new guest kernel I can only get 3.5Gb/s
> > with the patch and the same host. I will dig into why.
> 
> The regression is from the guest kernel, not from this patch. I tested
> the 2.6.31 kernel, and its performance is already less than 2Gb/s for
> a 2K message size. I will resubmit the patch for review.
> 
> I will start testing from the 2.6.30 kernel to figure out when the TX
> regression was introduced in virtio_net. Any suggestion on which guest
> kernel I should test to figure out this regression?
> 
> Thanks
> Shirley

git bisect 2.6.31 2.6.30
and go from here.
Shirley Ma Oct. 29, 2010, 3:43 p.m. UTC | #12
On Fri, 2010-10-29 at 10:10 +0200, Michael S. Tsirkin wrote:
> Hmm. I don't yet understand. We are still doing copies into the per-vq
> buffer, and the data copied is really small.  Is it about cache line
> bounces?  Could you try figuring it out?

The per-vq buffer is much less expensive than 3 put_copy() calls. I will
collect the profiling data to show that.

> > > 2. How about flushing out queued stuff before we exit
> > >    the handle_tx loop? That would address most of
> > >    the spec issue. 
> > 
> > The performance is almost the same as with the previous patch. I
> > will resubmit the modified one, adding vhost_add_used_and_signal_n
> > after the handle_tx loop to process the pending queue.
> > 
> > This patch was part of the modified macvtap zero-copy work, which I
> > haven't submitted yet. I found it helps vhost TX in general. This
> > pending queue will be used for DMA-done handling later, so I put it
> > in the vq instead of a local variable in handle_tx.
> > 
> > Thanks
> > Shirley
> 
> BTW why do we need another array? Isn't heads field exactly what we
> need here?

The heads field only holds up to 32 entries; from the test results, the
more used buffers that are accumulated before add-and-signal, the better
the performance. That was one of the reasons I didn't use heads. The
other reason is that I use these buffers for pending DMA-done handling
in the macvtap zero-copy patch. It could be up to vq->num in the worst
case.

Thanks
Shirley


Patch

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4b4da5b..3eb8016 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -128,6 +128,7 @@  static void handle_tx(struct vhost_net *net)
 	int err, wmem;
 	size_t hdr_size;
 	struct socket *sock;
+	int max_pend = vq->num - (vq->num >> 2);
 
 	sock = rcu_dereference_check(vq->private_data,
 				     lockdep_is_held(&vq->mutex));
@@ -198,7 +199,24 @@  static void handle_tx(struct vhost_net *net)
 		if (err != len)
 			pr_debug("Truncated TX packet: "
 				 " len %d != %zd\n", err, len);
-		vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		/*
+		 * If the pending array was not allocated, signal used buffers
+		 * one by one; otherwise, batch them and signal once 3/4 of
+		 * the ring size is reached, to reduce CPU utilization.
+		 */
+		if (unlikely(!vq->pend))
+			vhost_add_used_and_signal(&net->dev, vq, head, 0);
+		else {
+			vq->pend[vq->num_pend].id = head;
+			vq->pend[vq->num_pend].len = 0;
+			++vq->num_pend;
+			if (vq->num_pend == max_pend) {
+				vhost_add_used_and_signal_n(&net->dev, vq,
+							    vq->pend,
+							    vq->num_pend);
+				vq->num_pend = 0;
+			}
+		}
 		total_len += len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 94701ff..f2f3288 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -170,6 +170,16 @@  static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->call_ctx = NULL;
 	vq->call = NULL;
 	vq->log_ctx = NULL;
+	/* signal pending used buffers */
+	if (vq->pend) {
+		if (vq->num_pend != 0) {
+			vhost_add_used_and_signal_n(dev, vq, vq->pend,
+						    vq->num_pend);
+			vq->num_pend = 0;
+		}
+		kfree(vq->pend);
+	}
+	vq->pend = NULL;
 }
 
 static int vhost_worker(void *data)
@@ -273,7 +283,14 @@  long vhost_dev_init(struct vhost_dev *dev,
 		dev->vqs[i].heads = NULL;
 		dev->vqs[i].dev = dev;
 		mutex_init(&dev->vqs[i].mutex);
+		dev->vqs[i].num_pend = 0;
+		dev->vqs[i].pend = NULL;
 		vhost_vq_reset(dev, dev->vqs + i);
+		/* batch up to 3/4 of the ring size before signaling */
+		dev->vqs[i].pend = kmalloc((dev->vqs[i].num -
+					   (dev->vqs[i].num >> 2)) *
+					   sizeof *dev->vqs[i].pend,
+					   GFP_KERNEL);
 		if (dev->vqs[i].handle_kick)
 			vhost_poll_init(&dev->vqs[i].poll,
 					dev->vqs[i].handle_kick, POLLIN, dev);
@@ -599,6 +616,21 @@  static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
 			r = -EINVAL;
 			break;
 		}
+		if (vq->num != s.num) {
+			/* signal used buffers first */
+			if (vq->pend) {
+				if (vq->num_pend != 0) {
+					vhost_add_used_and_signal_n(vq->dev, vq,
+								    vq->pend,
+								    vq->num_pend);
+					vq->num_pend = 0;
+				}
+				kfree(vq->pend);
+			}
+			/* reallocate the pending used buffer array */
+			vq->pend = kmalloc((s.num - (s.num >> 2)) *
+					   sizeof *vq->pend, GFP_KERNEL);
+		}
 		vq->num = s.num;
 		break;
 	case VHOST_SET_VRING_BASE:
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 073d06a..78949c0 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -108,6 +108,9 @@  struct vhost_virtqueue {
 	/* Log write descriptors */
 	void __user *log_base;
 	struct vhost_log *log;
+	/* delay multiple used buffers to signal once */
+	int num_pend;
+	struct vring_used_elem *pend;
 };
 
 struct vhost_dev {