Message ID | 1288240804.14342.1.camel@localhost.localdomain |
---|---|
State | RFC, archived |
Delegated to: | David Miller |
Headers | show |
On Wed, Oct 27, 2010 at 09:40:04PM -0700, Shirley Ma wrote: > Resubmit this patch for fixing some minor error (white space, typo). > > Signed-off-by: Shirley Ma <xma@us.ibm.com> My concern is this can delay signalling for unlimited time. Could you pls test this with guests that do not have 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb b0c39dbdc204006ef3558a66716ff09797619778 that is 2.6.31 and older? This seems to be slighltly out of spec, even though for TX, signals are less important. Two ideas: 1. How about writing out used, just delaying the signal? This way we don't have to queue separately. 2. How about flushing out queued stuff before we exit the handle_tx loop? That would address most of the spec issue. > --- > drivers/vhost/net.c | 20 +++++++++++++++++++- > drivers/vhost/vhost.c | 32 ++++++++++++++++++++++++++++++++ > drivers/vhost/vhost.h | 3 +++ > 3 files changed, 54 insertions(+), 1 deletions(-) > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > index 4b4da5b..3eb8016 100644 > --- a/drivers/vhost/net.c > +++ b/drivers/vhost/net.c > @@ -128,6 +128,7 @@ static void handle_tx(struct vhost_net *net) > int err, wmem; > size_t hdr_size; > struct socket *sock; > + int max_pend = vq->num - (vq->num >> 2); > > sock = rcu_dereference_check(vq->private_data, > lockdep_is_held(&vq->mutex)); > @@ -198,7 +199,24 @@ static void handle_tx(struct vhost_net *net) > if (err != len) > pr_debug("Truncated TX packet: " > " len %d != %zd\n", err, len); > - vhost_add_used_and_signal(&net->dev, vq, head, 0); > + /* > + * if no pending buffer size allocate, signal used buffer > + * one by one, otherwise, signal used buffer when reaching > + * 3/4 ring size to reduce CPU utilization. 
> + */ > + if (unlikely(vq->pend)) > + vhost_add_used_and_signal(&net->dev, vq, head, 0); > + else { > + vq->pend[vq->num_pend].id = head; > + vq->pend[vq->num_pend].len = 0; > + ++vq->num_pend; > + if (vq->num_pend == max_pend) { > + vhost_add_used_and_signal_n(&net->dev, vq, > + vq->pend, > + vq->num_pend); > + vq->num_pend = 0; > + } > + } > total_len += len; > if (unlikely(total_len >= VHOST_NET_WEIGHT)) { > vhost_poll_queue(&vq->poll); > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 94701ff..f2f3288 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -170,6 +170,16 @@ static void vhost_vq_reset(struct vhost_dev *dev, > vq->call_ctx = NULL; > vq->call = NULL; > vq->log_ctx = NULL; > + /* signal pending used buffers */ > + if (vq->pend) { > + if (vq->num_pend != 0) { > + vhost_add_used_and_signal_n(dev, vq, vq->pend, > + vq->num_pend); > + vq->num_pend = 0; > + } > + kfree(vq->pend); > + } > + vq->pend = NULL; > } > > static int vhost_worker(void *data) > @@ -273,7 +283,14 @@ long vhost_dev_init(struct vhost_dev *dev, > dev->vqs[i].heads = NULL; > dev->vqs[i].dev = dev; > mutex_init(&dev->vqs[i].mutex); > + dev->vqs[i].num_pend = 0; > + dev->vqs[i].pend = NULL; > vhost_vq_reset(dev, dev->vqs + i); > + /* signal 3/4 of ring size used buffers */ > + dev->vqs[i].pend = kmalloc((dev->vqs[i].num - > + (dev->vqs[i].num >> 2)) * > + sizeof *dev->vqs[i].pend, > + GFP_KERNEL); > if (dev->vqs[i].handle_kick) > vhost_poll_init(&dev->vqs[i].poll, > dev->vqs[i].handle_kick, POLLIN, dev); > @@ -599,6 +616,21 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) > r = -EINVAL; > break; > } > + if (vq->num != s.num) { > + /* signal used buffers first */ > + if (vq->pend) { > + if (vq->num_pend != 0) { > + vhost_add_used_and_signal_n(vq->dev, vq, > + vq->pend, > + vq->num_pend); > + vq->num_pend = 0; > + } > + kfree(vq->pend); > + } > + /* realloc pending used buffers size */ > + vq->pend = kmalloc((s.num - 
(s.num >> 2)) * > + sizeof *vq->pend, GFP_KERNEL); > + } > vq->num = s.num; > break; > case VHOST_SET_VRING_BASE: > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > index 073d06a..78949c0 100644 > --- a/drivers/vhost/vhost.h > +++ b/drivers/vhost/vhost.h > @@ -108,6 +108,9 @@ struct vhost_virtqueue { > /* Log write descriptors */ > void __user *log_base; > struct vhost_log *log; > + /* delay multiple used buffers to signal once */ > + int num_pend; > + struct vring_used_elem *pend; > }; > > struct vhost_dev { > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote: > My concern is this can delay signalling for unlimited time. > Could you pls test this with guests that do not have > 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb > b0c39dbdc204006ef3558a66716ff09797619778 > that is 2.6.31 and older? I will test it out. > This seems to be slighltly out of spec, even though > for TX, signals are less important. > Two ideas: > 1. How about writing out used, just delaying the signal? > This way we don't have to queue separately. > 2. How about flushing out queued stuff before we exit > the handle_tx loop? That would address most of > the spec issue. I will modify the patch to test out the performance for both 1 & 2 approaches. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Two ideas: > 1. How about writing out used, just delaying the signal? > This way we don't have to queue separately. This improves some performance, but not as much as delaying both used and signal. Delaying used buffers combines multiple small copies into one large copy, which saves more CPU utilization and increases some BW. > 2. How about flushing out queued stuff before we exit > the handle_tx loop? That would address most of > the spec issue. The performance is almost the same as the previous patch. I will resubmit the modified one, adding vhost_add_used_and_signal_n after the handle_tx loop for processing the pending queue. This patch was a part of a modified macvtap zero copy patch which I haven't submitted yet. I found this helped vhost TX in general. This pending queue will be used by DMA done later, so I put it in vq instead of a local variable in handle_tx. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote: > My concern is this can delay signalling for unlimited time. > Could you pls test this with guests that do not have > 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb > b0c39dbdc204006ef3558a66716ff09797619778 > that is 2.6.31 and older? The patch only delays signaling for an unlimited time when there is no TX packet to transmit. I thought TX signaling only notifies the guest to release the used buffers; is there anything else besides this? I tested a rhel5u5 guest (2.6.18 kernel), and it works fine. I checked the two commit logs; I don't think this patch could cause any issue w/o these two patches. Also I found a big TX regression for old guest and new guest. For the old guest, I am able to get almost 11Gb/s for 2K message size, but for the new guest kernel, I can only get 3.5 Gb/s with the patch and the same host. I will dig into why. thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote: > Also I found a big TX regression for old guest and new guest. For old > guest, I am able to get almost 11Gb/s for 2K message size, but for the > new guest kernel, I can only get 3.5 Gb/s with the patch and same > host. > I will dig it why. The regression is from the guest kernel, not from this patch. I tested the 2.6.31 kernel; its performance is less than 2Gb/s for 2K message size already. I will resubmit the patch for review. I will start testing from the 2.6.30 kernel to figure out when the TX regression was introduced in virtio_net. Any suggestion which guest kernels I should test to figure out this regression? Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2010-10-28 at 13:13 -0700, Shirley Ma wrote: > On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote: > > Also I found a big TX regression for old guest and new guest. For old > > guest, I am able to get almost 11Gb/s for 2K message size, but for the > > new guest kernel, I can only get 3.5 Gb/s with the patch and same > > host. > > I will dig it why. > > The regression is from guest kernel, not from this patch. Tested 2.6.31 > kernel, it's performance is less than 2Gb/s for 2K message size already. > I will resubmit the patch for review. > > I will start to test from 2.6.30 kernel to figure it when TX regression > induced in virtio_net. Any suggestion which guest kernel I should test > to figure out this regression? It would be some change in virtio-net driver that may have improved the latency of small messages which in turn would have reduced the bandwidth as TCP could not accumulate and send large packets. Thanks Sridhar -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2010-10-28 at 14:04 -0700, Sridhar Samudrala wrote: > It would be some change in virtio-net driver that may have improved > the > latency of small messages which in turn would have reduced the > bandwidth > as TCP could not accumulate and send large packets. I will check out any latency improvement patch in virtio_net. If that's the case, would it be good to have some tunable parameter to benefit both BW and latency workloads? Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Oct 28, 2010 at 12:32:35PM -0700, Shirley Ma wrote: > On Thu, 2010-10-28 at 07:20 +0200, Michael S. Tsirkin wrote: > > My concern is this can delay signalling for unlimited time. > > Could you pls test this with guests that do not have > > 2b5bbe3b8bee8b38bdc27dd9c0270829b6eb7eeb > > b0c39dbdc204006ef3558a66716ff09797619778 > > that is 2.6.31 and older? > > The patch only induces delay signaling unlimited time when there is no > TX packet to transmit. I thought TX signaling only noticing guest to > release the used buffers, anything else beside this? Right, that's it I think. For newer kernels we orphan the skb on xmit so we don't care that much about completing them. This does rely on an undocumented assumption about guest behaviour though. > I tested rhel5u5 guest (2.6.18 kernel), it works fine. I checked the two > commits log, I don't think this patch could cause any issue w/o these > two patches. > > Also I found a big TX regression for old guest and new guest. For old > guest, I am able to get almost 11Gb/s for 2K message size, but for the > new guest kernel, I can only get 3.5 Gb/s with the patch and same host. > I will dig it why. > > thanks > Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Oct 28, 2010 at 10:14:22AM -0700, Shirley Ma wrote: > > > Two ideas: > > 1. How about writing out used, just delaying the signal? > > This way we don't have to queue separately. > > This improves some performance, but not as good as delaying > both used and signal. Since delaying used buffers combining > multiple small copies to a large copy, which saves more CPU > utilization and increased some BW. Hmm. I don't yet understand. We are still doing copies into the per-vq buffer, and the data copied is really small. Is it about cache line bounces? Could you try figuring it out? > > 2. How about flushing out queued stuff before we exit > > the handle_tx loop? That would address most of > > the spec issue. > > The performance is almost as same as the previous patch. I will resubmit > the modified one, adding vhost_add_used_and_signal_n after handle_tx > loop for processing pending queue. > > This patch was a part of modified macvtap zero copy which I haven't > submitted yet. I found this helped vhost TX in general. This pending > queue will be used by DMA done later, so I put it in vq instead of a > local variable in handle_tx. > > Thanks > Shirley BTW why do we need another array? Isn't heads field exactly what we need here? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Oct 28, 2010 at 02:40:50PM -0700, Shirley Ma wrote: > On Thu, 2010-10-28 at 14:04 -0700, Sridhar Samudrala wrote: > > It would be some change in virtio-net driver that may have improved > > the > > latency of small messages which in turn would have reduced the > > bandwidth > > as TCP could not accumulate and send large packets. > > I will check out any latency improvement patch in virtio_net. If that's > the case, whether it is good to have some tunable parameter to benefit > both BW and latency workload? > > Shirley No, we need it to work well automatically somehow.
On Thu, Oct 28, 2010 at 01:13:55PM -0700, Shirley Ma wrote: > On Thu, 2010-10-28 at 12:32 -0700, Shirley Ma wrote: > > Also I found a big TX regression for old guest and new guest. For old > > guest, I am able to get almost 11Gb/s for 2K message size, but for the > > new guest kernel, I can only get 3.5 Gb/s with the patch and same > > host. > > I will dig it why. > > The regression is from guest kernel, not from this patch. Tested 2.6.31 > kernel, it's performance is less than 2Gb/s for 2K message size already. > I will resubmit the patch for review. > > I will start to test from 2.6.30 kernel to figure it when TX regression > induced in virtio_net. Any suggestion which guest kernel I should test > to figure out this regression? > > Thanks > Shirley git bisect 2.6.31 2.6.30 and go from here. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2010-10-29 at 10:10 +0200, Michael S. Tsirkin wrote: > Hmm. I don't yet understand. We are still doing copies into the per-vq > buffer, and the data copied is really small. Is it about cache line > bounces? Could you try figuring it out? The per-vq buffer is much less expensive than 3 put_copy() calls. I will collect the profiling data to show that. > > > 2. How about flushing out queued stuff before we exit > > > the handle_tx loop? That would address most of > > > the spec issue. > > > > The performance is almost as same as the previous patch. I will > resubmit > > the modified one, adding vhost_add_used_and_signal_n after handle_tx > > loop for processing pending queue. > > > > This patch was a part of modified macvtap zero copy which I haven't > > submitted yet. I found this helped vhost TX in general. This pending > > queue will be used by DMA done later, so I put it in vq instead of a > > local variable in handle_tx. > > > > Thanks > > Shirley > > BTW why do we need another array? Isn't heads field exactly what we > need > here? The heads field is only for up to 32 entries; the more used buffers that are added and signaled together, the better the performance is from the test results. That was one of the reasons I didn't use heads. The other reason was that I used these buffers for pending DMA done in the macvtap zero copy patch. It could be up to vq->num in the worst case. Thanks Shirley -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 4b4da5b..3eb8016 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -128,6 +128,7 @@ static void handle_tx(struct vhost_net *net) int err, wmem; size_t hdr_size; struct socket *sock; + int max_pend = vq->num - (vq->num >> 2); sock = rcu_dereference_check(vq->private_data, lockdep_is_held(&vq->mutex)); @@ -198,7 +199,24 @@ static void handle_tx(struct vhost_net *net) if (err != len) pr_debug("Truncated TX packet: " " len %d != %zd\n", err, len); - vhost_add_used_and_signal(&net->dev, vq, head, 0); + /* + * if no pending buffer size allocate, signal used buffer + * one by one, otherwise, signal used buffer when reaching + * 3/4 ring size to reduce CPU utilization. + */ + if (unlikely(vq->pend)) + vhost_add_used_and_signal(&net->dev, vq, head, 0); + else { + vq->pend[vq->num_pend].id = head; + vq->pend[vq->num_pend].len = 0; + ++vq->num_pend; + if (vq->num_pend == max_pend) { + vhost_add_used_and_signal_n(&net->dev, vq, + vq->pend, + vq->num_pend); + vq->num_pend = 0; + } + } total_len += len; if (unlikely(total_len >= VHOST_NET_WEIGHT)) { vhost_poll_queue(&vq->poll); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 94701ff..f2f3288 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -170,6 +170,16 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->call_ctx = NULL; vq->call = NULL; vq->log_ctx = NULL; + /* signal pending used buffers */ + if (vq->pend) { + if (vq->num_pend != 0) { + vhost_add_used_and_signal_n(dev, vq, vq->pend, + vq->num_pend); + vq->num_pend = 0; + } + kfree(vq->pend); + } + vq->pend = NULL; } static int vhost_worker(void *data) @@ -273,7 +283,14 @@ long vhost_dev_init(struct vhost_dev *dev, dev->vqs[i].heads = NULL; dev->vqs[i].dev = dev; mutex_init(&dev->vqs[i].mutex); + dev->vqs[i].num_pend = 0; + dev->vqs[i].pend = NULL; vhost_vq_reset(dev, dev->vqs + i); + /* signal 3/4 of ring size used buffers */ + dev->vqs[i].pend = 
kmalloc((dev->vqs[i].num - + (dev->vqs[i].num >> 2)) * + sizeof *dev->vqs[i].pend, + GFP_KERNEL); if (dev->vqs[i].handle_kick) vhost_poll_init(&dev->vqs[i].poll, dev->vqs[i].handle_kick, POLLIN, dev); @@ -599,6 +616,21 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) r = -EINVAL; break; } + if (vq->num != s.num) { + /* signal used buffers first */ + if (vq->pend) { + if (vq->num_pend != 0) { + vhost_add_used_and_signal_n(vq->dev, vq, + vq->pend, + vq->num_pend); + vq->num_pend = 0; + } + kfree(vq->pend); + } + /* realloc pending used buffers size */ + vq->pend = kmalloc((s.num - (s.num >> 2)) * + sizeof *vq->pend, GFP_KERNEL); + } vq->num = s.num; break; case VHOST_SET_VRING_BASE: diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 073d06a..78949c0 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -108,6 +108,9 @@ struct vhost_virtqueue { /* Log write descriptors */ void __user *log_base; struct vhost_log *log; + /* delay multiple used buffers to signal once */ + int num_pend; + struct vring_used_elem *pend; }; struct vhost_dev {
Resubmit this patch for fixing some minor error (white space, typo). Signed-off-by: Shirley Ma <xma@us.ibm.com> --- drivers/vhost/net.c | 20 +++++++++++++++++++- drivers/vhost/vhost.c | 32 ++++++++++++++++++++++++++++++++ drivers/vhost/vhost.h | 3 +++ 3 files changed, 54 insertions(+), 1 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html