Patchwork [2/2] vhost-net: add a spin_threshold parameter

login
register
mail settings
Submitter Anthony Liguori
Date Feb. 17, 2012, 11:02 p.m.
Message ID <1329519726-25763-3-git-send-email-aliguori@us.ibm.com>
Download mbox | patch
Permalink /patch/142021/
State Awaiting Upstream
Delegated to: David Miller
Headers show

Comments

Anthony Liguori - Feb. 17, 2012, 11:02 p.m.
With workloads that are dominated by very high rates of small packets, we see
considerable overhead in virtio notifications.

The best strategy we've been able to come up with to deal with this is adaptive
polling.  This patch simply adds the infrastructure needed to experiment with
polling strategies.  It is not meant for inclusion.

Here are the results with various polling values.  The spinning is not currently
a net win due to the high mutex contention caused by the broadcast wakeup.  With
a patch attempting to signal wakeup, we see up to 170+ transactions per second
with TCP_RR 60 instances.

N  Baseline	Spin 0		Spin 1000	Spin 5000

TCP_RR

1  9,639.66	10,164.06	9,825.43	9,827.45	101.95%
10 62,819.55	54,059.78	63,114.30	60,767.23	96.73%
30 84,715.60	131,241.86	120,922.38	89,776.39	105.97%
60 124,614.71	148,720.66	158,678.08	141,400.05	113.47%

UDP_RR

1  9,652.50	10,343.72	9,493.95	9,569.54	99.14%
10 53,830.26	58,235.90	50,145.29	48,820.53	90.69%
30 89,471.01	97,634.53	95,108.34	91,263.65	102.00%
60 103,640.59	164,035.01	157,002.22	128,646.73	124.13%

TCP_STREAM
1  2,622.63	2,610.71	2,688.49	2,678.61	102.13%
4  4,928.02	4,812.05	4,971.00	5,104.57	103.58%

1  5,639.89	5,751.28	5,819.81	5,593.62	99.18%
4  5,874.72	6,575.55	6,324.87	6,502.33	110.68%

1  6,257.42	7,655.22	7,610.52	7,424.74	118.65%
4  5,370.78	6,044.83	5,784.23	6,209.93	115.62%

1  6,346.63	7,267.44	7,567.39	7,677.93	120.98%
4  5,198.02	5,657.12	5,528.94	5,792.42	111.44%

TCP_MAERTS

1  2,091.38	1,765.62	2,142.56	2,312.94	110.59%
4  5,319.52	5,619.49	5,544.50	5,645.81	106.13%

1  7,030.66	7,593.61	7,575.67	7,622.07	108.41%
4  9,040.53	7,275.84	7,322.07	6,681.34	73.90%

1  9,160.93	9,318.15	9,065.82	8,586.82	93.73%
4  9,372.49	8,875.63	8,959.03	9,056.07	96.62%

1  9,183.28	9,134.02	8,945.12	8,657.72	94.28%
4  9,377.17	8,877.52	8,959.54	9,071.53	96.74%

Cc: Tom Lendacky <toml@us.ibm.com>
Cc: Cristian Viana <vianac@br.ibm.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 drivers/vhost/net.c |   14 ++++++++++++++
 1 files changed, 14 insertions(+), 0 deletions(-)
Michael S. Tsirkin - Feb. 19, 2012, 2:51 p.m.
On Fri, Feb 17, 2012 at 05:02:06PM -0600, Anthony Liguori wrote:
> With workloads that are dominated by very high rates of small packets, we see
> considerable overhead in virtio notifications.
> 
> The best strategy we've been able to come up with to deal with this is adaptive
> polling.
>  This patch simply adds the infrastructure needed to experiment with
> polling strategies.  It is not meant for inclusion.
> 
> Here are the results with various polling values.  The spinning is not currently
> a net win due to the high mutex contention caused by the broadcast wakeup.  With
> a patch attempting to signal wakeup, we see up to 170+ transactions per second
> with TCP_RR 60 instance.
> 
> N  Baseline	Spin 0		Spin 1000	Spin 5000
> 
> TCP_RR
> 
> 1  9,639.66	10,164.06	9,825.43	9,827.45	101.95%
> 10 62,819.55	54,059.78	63,114.30	60,767.23	96.73%
> 30 84,715.60	131,241.86	120,922.38	89,776.39	105.97%
> 60 124,614.71	148,720.66	158,678.08	141,400.05	113.47%
> 
> UDP_RR
> 
> 1  9,652.50	10,343.72	9,493.95	9,569.54	99.14%
> 10 53,830.26	58,235.90	50,145.29	48,820.53	90.69%
> 30 89,471.01	97,634.53	95,108.34	91,263.65	102.00%
> 60 103,640.59	164,035.01	157,002.22	128,646.73	124.13%
> 
> TCP_STREAM
> 1  2,622.63	2,610.71	2,688.49	2,678.61	102.13%
> 4  4,928.02	4,812.05	4,971.00	5,104.57	103.58%
> 
> 1  5,639.89	5,751.28	5,819.81	5,593.62	99.18%
> 4  5,874.72	6,575.55	6,324.87	6,502.33	110.68%
> 
> 1  6,257.42	7,655.22	7,610.52	7,424.74	118.65%
> 4  5,370.78	6,044.83	5,784.23	6,209.93	115.62%
> 
> 1  6,346.63	7,267.44	7,567.39	7,677.93	120.98%
> 4  5,198.02	5,657.12	5,528.94	5,792.42	111.44%
> 
> TCP_MAERTS
> 
> 1  2,091.38	1,765.62	2,142.56	2,312.94	110.59%
> 4  5,319.52	5,619.49	5,544.50	5,645.81	106.13%
> 
> 1  7,030.66	7,593.61	7,575.67	7,622.07	108.41%
> 4  9,040.53	7,275.84	7,322.07	6,681.34	73.90%
> 
> 1  9,160.93	9,318.15	9,065.82	8,586.82	93.73%
> 4  9,372.49	8,875.63	8,959.03	9,056.07	96.62%
> 
> 1  9,183.28	9,134.02	8,945.12	8,657.72	94.28%
> 4  9,377.17	8,877.52	8,959.54	9,071.53	96.74%

An obvious question would be how are BW divided by CPU
numbers affected.

> Cc: Tom Lendacky <toml@us.ibm.com>
> Cc: Cristian Viana <vianac@br.ibm.com>
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
> ---
>  drivers/vhost/net.c |   14 ++++++++++++++
>  1 files changed, 14 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 47175cd..e9e5866 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -37,6 +37,10 @@ static int workers = 2;
>  module_param(workers, int, 0444);
>  MODULE_PARM_DESC(workers, "Set the number of worker threads");
>  
> +static ulong spin_threshold = 0;
> +module_param(spin_threshold, ulong, 0444);
> +MODULE_PARM_DESC(spin_threshold, "The polling threshold for the tx queue");
> +
>  /* Max number of bytes transferred before requeueing the job.
>   * Using this limit prevents one virtqueue from starving others. */
>  #define VHOST_NET_WEIGHT 0x80000
> @@ -65,6 +69,7 @@ struct vhost_net {
>  	 * We only do this when socket buffer fills up.
>  	 * Protected by tx vq lock. */
>  	enum vhost_net_poll_state tx_poll_state;
> +	size_t spin_threshold;
>  };
>  
>  static bool vhost_sock_zcopy(struct socket *sock)
> @@ -149,6 +154,7 @@ static void handle_tx(struct vhost_net *net)
>  	size_t hdr_size;
>  	struct socket *sock;
>  	struct vhost_ubuf_ref *uninitialized_var(ubufs);
> +	size_t spin_count;
>  	bool zcopy;
>  
>  	/* TODO: check that we are running from vhost_worker? */
> @@ -172,6 +178,7 @@ static void handle_tx(struct vhost_net *net)
>  	hdr_size = vq->vhost_hlen;
>  	zcopy = vhost_sock_zcopy(sock);
>  
> +	spin_count = 0;
>  	for (;;) {
>  		/* Release DMAs done buffers first */
>  		if (zcopy)
> @@ -205,9 +212,15 @@ static void handle_tx(struct vhost_net *net)
>  				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
>  				break;
>  			}
> +			if (spin_count < net->spin_threshold) {
> +				spin_count++;
> +				continue;
> +			}
>  			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>  				vhost_disable_notify(&net->dev, vq);
>  				continue;
> +			} else {
> +				spin_count = 0;
>  			}
>  			break;
>  		}
> @@ -506,6 +519,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  		return -ENOMEM;
>  
>  	dev = &n->dev;
> +	n->spin_threshold = spin_threshold;
>  	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
>  	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
>  	r = vhost_dev_init(dev, n->vqs, workers, VHOST_NET_VQ_MAX);
> -- 
> 1.7.4.1
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma - Feb. 21, 2012, 1:35 a.m.
We tried similar approach before by using a minimum timer for handle_tx
to stay in the loop to accumulate more packets before enabling the guest
notification. It did have better TCP_RRs, UDP_RRs results. However, we
think this is just a debug patch. We really need to understand why
handle_tx can't see more packets to process for multiple instances
request/response type of workload first. Spinning in this loop is not a
good solution.

Thanks
Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Wang - Feb. 21, 2012, 5:34 a.m.
On 02/21/2012 09:35 AM, Shirley Ma wrote:
> We tried similar approach before by using a minimum timer for handle_tx
> to stay in the loop to accumulate more packets before enabling the guest
> notification. It did have better TCP_RRs, UDP_RRs results. However, we
> think this is just a debug patch. We really need to understand why
> handle_tx can't see more packets to process for multiple instances
> request/response type of workload first. Spinning in this loop is not a
> good solution.

Spinning helps with latency, but looks like we need some adaptive 
method to adjust the threshold dynamically such as monitor the minimum 
time gap between two packets. For throughput, if we can improve the 
batching of small packets we can improve it. I've tried to use event 
index to delay the tx kick until a specified number of packets were 
batched in the virtqueue. Test shows improvement of throughput in small 
packets as the number of #exit were reduced greatly ( the packets/#exit 
and cpu utilization were increased), but it damages the performance of 
other. This is only for debug, but it confirms that there's something we 
need to improve in the batching.

>
> Thanks
> Shirley
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma - Feb. 21, 2012, 6:28 a.m.
On Tue, 2012-02-21 at 13:34 +0800, Jason Wang wrote:
> On 02/21/2012 09:35 AM, Shirley Ma wrote:
> > We tried similar approach before by using a minimum timer for
> handle_tx
> > to stay in the loop to accumulate more packets before enabling the
> guest
> > notification. It did have better TCP_RRs, UDP_RRs results. However,
> we
> > think this is just a debug patch. We really need to understand why
> > handle_tx can't see more packets to process for multiple instances
> > request/response type of workload first. Spinning in this loop is
> not a
> > good solution.
> 
> Spinning help for the latency, but looks like we need some adaptive 
> method to adjust the threshold dynamically such as monitor the
> minimum 
> time gap between two packets. For throughput, if we can improve the 
> batching of small packets we can improve it. I've tired to use event 
> index to delay the tx kick until a specified number of packets were 
> batched in the virtqueue. Test shows improvement of throughput in
> small 
> packets as the number of #exit were reduced greatly ( the
> packets/#exit 
> and cpu utilization were increased), but it damages the performance
> of 
> other. This is only for debug, but it confirms that there's something
> we 
> need to improve the batching.

Our test case was 60 instances 256/256 bytes tcp_rrs or udp_rrs. In
theory there should be multiple packets in the queue by the time vhost
gets notified, but from debugging output, there was only a few or even
one packet in the queue. So the question here is why the time gap between
two packets is that big?

Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Wang - Feb. 21, 2012, 6:38 a.m.
On 02/21/2012 02:28 PM, Shirley Ma wrote:
> On Tue, 2012-02-21 at 13:34 +0800, Jason Wang wrote:
>> On 02/21/2012 09:35 AM, Shirley Ma wrote:
>>> We tried similar approach before by using a minimum timer for
>> handle_tx
>>> to stay in the loop to accumulate more packets before enabling the
>> guest
>>> notification. It did have better TCP_RRs, UDP_RRs results. However,
>> we
>>> think this is just a debug patch. We really need to understand why
>>> handle_tx can't see more packets to process for multiple instances
>>> request/response type of workload first. Spinning in this loop is
>> not a
>>> good solution.
>> Spinning help for the latency, but looks like we need some adaptive
>> method to adjust the threshold dynamically such as monitor the
>> minimum
>> time gap between two packets. For throughput, if we can improve the
>> batching of small packets we can improve it. I've tired to use event
>> index to delay the tx kick until a specified number of packets were
>> batched in the virtqueue. Test shows improvement of throughput in
>> small
>> packets as the number of #exit were reduced greatly ( the
>> packets/#exit
>> and cpu utilization were increased), but it damages the performance
>> of
>> other. This is only for debug, but it confirms that there's something
>> we
>> need to improve the batching.
> Our test case was 60 instances 256/256 bytes tcp_rrs or udp_rrs. In
> theory there should be multiple packets in the queue by the time vhost
> gets notified, but from debugging output, there was only a few or even
> one packet in the queue. So the questions here why the time gap between
> two packets is that big?
>
> Shirley

Not sure whether it's related but did you try to disable the nagle 
algorithm during the test?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma - Feb. 21, 2012, 11:09 a.m.
On Tue, 2012-02-21 at 14:38 +0800, Jason Wang wrote:
> Not sure whether it's related but did you try to disable the nagle 
> algorithm during the test?

UDP doesn't use nagle, TCP_RRs results had no difference disable/enable
nagle algorithm.

Thanks
Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala - Feb. 21, 2012, 4:08 p.m.
On Tue, 2012-02-21 at 14:38 +0800, Jason Wang wrote:
> On 02/21/2012 02:28 PM, Shirley Ma wrote:
> > On Tue, 2012-02-21 at 13:34 +0800, Jason Wang wrote:
> >> On 02/21/2012 09:35 AM, Shirley Ma wrote:
> >>> We tried similar approach before by using a minimum timer for
> >> handle_tx
> >>> to stay in the loop to accumulate more packets before enabling the
> >> guest
> >>> notification. It did have better TCP_RRs, UDP_RRs results. However,
> >> we
> >>> think this is just a debug patch. We really need to understand why
> >>> handle_tx can't see more packets to process for multiple instances
> >>> request/response type of workload first. Spinning in this loop is
> >> not a
> >>> good solution.
> >> Spinning help for the latency, but looks like we need some adaptive
> >> method to adjust the threshold dynamically such as monitor the
> >> minimum
> >> time gap between two packets. For throughput, if we can improve the
> >> batching of small packets we can improve it. I've tired to use event
> >> index to delay the tx kick until a specified number of packets were
> >> batched in the virtqueue. Test shows improvement of throughput in
> >> small
> >> packets as the number of #exit were reduced greatly ( the
> >> packets/#exit
> >> and cpu utilization were increased), but it damages the performance
> >> of
> >> other. This is only for debug, but it confirms that there's something
> >> we
> >> need to improve the batching.
> > Our test case was 60 instances 256/256 bytes tcp_rrs or udp_rrs. In
> > theory there should be multiple packets in the queue by the time vhost
> > gets notified, but from debugging output, there was only a few or even
> > one packet in the queue. So the questions here why the time gap between
> > two packets is that big?
> >
> > Shirley
> 
> Not sure whether it's related but did you try to disable the nagle 
> algorithm during the test?

This is a 60-instance 256 byte request-response test. So each request
for each instance is independent and is sub-mtu size and the next
request is not sent until the response is received. So Nagle doesn't
delay any packets in this workload.

Thanks
Sridhar

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dor Laor - March 12, 2012, 8:12 a.m.
On 02/18/2012 01:02 AM, Anthony Liguori wrote:
> With workloads that are dominated by very high rates of small packets, we see
> considerable overhead in virtio notifications.
>
> The best strategy we've been able to come up with to deal with this is adaptive
> polling.  This patch simply adds the infrastructure needed to experiment with
> polling strategies.  It is not meant for inclusion.
>
> Here are the results with various polling values.  The spinning is not currently
> a net win due to the high mutex contention caused by the broadcast wakeup.  With
> a patch attempting to signal wakeup, we see up to 170+ transactions per second
> with TCP_RR 60 instance.

Do you really consider to add busy loop to the code? There would 
probably be another way to solve it.

One of the options we talked about is to add an ethtool-controllable knob 
in the guest to set the min/max time for wakeup.

>
> N  Baseline	Spin 0		Spin 1000	Spin 5000
>
> TCP_RR
>
> 1  9,639.66	10,164.06	9,825.43	9,827.45	101.95%
> 10 62,819.55	54,059.78	63,114.30	60,767.23	96.73%
> 30 84,715.60	131,241.86	120,922.38	89,776.39	105.97%
> 60 124,614.71	148,720.66	158,678.08	141,400.05	113.47%
>
> UDP_RR
>
> 1  9,652.50	10,343.72	9,493.95	9,569.54	99.14%
> 10 53,830.26	58,235.90	50,145.29	48,820.53	90.69%
> 30 89,471.01	97,634.53	95,108.34	91,263.65	102.00%
> 60 103,640.59	164,035.01	157,002.22	128,646.73	124.13%
>
> TCP_STREAM
> 1  2,622.63	2,610.71	2,688.49	2,678.61	102.13%
> 4  4,928.02	4,812.05	4,971.00	5,104.57	103.58%
>
> 1  5,639.89	5,751.28	5,819.81	5,593.62	99.18%
> 4  5,874.72	6,575.55	6,324.87	6,502.33	110.68%
>
> 1  6,257.42	7,655.22	7,610.52	7,424.74	118.65%
> 4  5,370.78	6,044.83	5,784.23	6,209.93	115.62%
>
> 1  6,346.63	7,267.44	7,567.39	7,677.93	120.98%
> 4  5,198.02	5,657.12	5,528.94	5,792.42	111.44%
>
> TCP_MAERTS
>
> 1  2,091.38	1,765.62	2,142.56	2,312.94	110.59%
> 4  5,319.52	5,619.49	5,544.50	5,645.81	106.13%
>
> 1  7,030.66	7,593.61	7,575.67	7,622.07	108.41%
> 4  9,040.53	7,275.84	7,322.07	6,681.34	73.90%
>
> 1  9,160.93	9,318.15	9,065.82	8,586.82	93.73%
> 4  9,372.49	8,875.63	8,959.03	9,056.07	96.62%
>
> 1  9,183.28	9,134.02	8,945.12	8,657.72	94.28%
> 4  9,377.17	8,877.52	8,959.54	9,071.53	96.74%
>
> Cc: Tom Lendacky<toml@us.ibm.com>
> Cc: Cristian Viana<vianac@br.ibm.com>
> Signed-off-by: Anthony Liguori<aliguori@us.ibm.com>
> ---
>   drivers/vhost/net.c |   14 ++++++++++++++
>   1 files changed, 14 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 47175cd..e9e5866 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -37,6 +37,10 @@ static int workers = 2;
>   module_param(workers, int, 0444);
>   MODULE_PARM_DESC(workers, "Set the number of worker threads");
>
> +static ulong spin_threshold = 0;
> +module_param(spin_threshold, ulong, 0444);
> +MODULE_PARM_DESC(spin_threshold, "The polling threshold for the tx queue");
> +
>   /* Max number of bytes transferred before requeueing the job.
>    * Using this limit prevents one virtqueue from starving others. */
>   #define VHOST_NET_WEIGHT 0x80000
> @@ -65,6 +69,7 @@ struct vhost_net {
>   	 * We only do this when socket buffer fills up.
>   	 * Protected by tx vq lock. */
>   	enum vhost_net_poll_state tx_poll_state;
> +	size_t spin_threshold;
>   };
>
>   static bool vhost_sock_zcopy(struct socket *sock)
> @@ -149,6 +154,7 @@ static void handle_tx(struct vhost_net *net)
>   	size_t hdr_size;
>   	struct socket *sock;
>   	struct vhost_ubuf_ref *uninitialized_var(ubufs);
> +	size_t spin_count;
>   	bool zcopy;
>
>   	/* TODO: check that we are running from vhost_worker? */
> @@ -172,6 +178,7 @@ static void handle_tx(struct vhost_net *net)
>   	hdr_size = vq->vhost_hlen;
>   	zcopy = vhost_sock_zcopy(sock);
>
> +	spin_count = 0;
>   	for (;;) {
>   		/* Release DMAs done buffers first */
>   		if (zcopy)
> @@ -205,9 +212,15 @@ static void handle_tx(struct vhost_net *net)
>   				set_bit(SOCK_ASYNC_NOSPACE,&sock->flags);
>   				break;
>   			}
> +			if (spin_count<  net->spin_threshold) {
> +				spin_count++;
> +				continue;
> +			}
>   			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>   				vhost_disable_notify(&net->dev, vq);
>   				continue;
> +			} else {
> +				spin_count = 0;
>   			}
>   			break;
>   		}
> @@ -506,6 +519,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>   		return -ENOMEM;
>
>   	dev =&n->dev;
> +	n->spin_threshold = spin_threshold;
>   	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
>   	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
>   	r = vhost_dev_init(dev, n->vqs, workers, VHOST_NET_VQ_MAX);

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 47175cd..e9e5866 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -37,6 +37,10 @@  static int workers = 2;
 module_param(workers, int, 0444);
 MODULE_PARM_DESC(workers, "Set the number of worker threads");
 
+static ulong spin_threshold = 0;
+module_param(spin_threshold, ulong, 0444);
+MODULE_PARM_DESC(spin_threshold, "The polling threshold for the tx queue");
+
 /* Max number of bytes transferred before requeueing the job.
  * Using this limit prevents one virtqueue from starving others. */
 #define VHOST_NET_WEIGHT 0x80000
@@ -65,6 +69,7 @@  struct vhost_net {
 	 * We only do this when socket buffer fills up.
 	 * Protected by tx vq lock. */
 	enum vhost_net_poll_state tx_poll_state;
+	size_t spin_threshold;
 };
 
 static bool vhost_sock_zcopy(struct socket *sock)
@@ -149,6 +154,7 @@  static void handle_tx(struct vhost_net *net)
 	size_t hdr_size;
 	struct socket *sock;
 	struct vhost_ubuf_ref *uninitialized_var(ubufs);
+	size_t spin_count;
 	bool zcopy;
 
 	/* TODO: check that we are running from vhost_worker? */
@@ -172,6 +178,7 @@  static void handle_tx(struct vhost_net *net)
 	hdr_size = vq->vhost_hlen;
 	zcopy = vhost_sock_zcopy(sock);
 
+	spin_count = 0;
 	for (;;) {
 		/* Release DMAs done buffers first */
 		if (zcopy)
@@ -205,9 +212,15 @@  static void handle_tx(struct vhost_net *net)
 				set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
 				break;
 			}
+			if (spin_count < net->spin_threshold) {
+				spin_count++;
+				continue;
+			}
 			if (unlikely(vhost_enable_notify(&net->dev, vq))) {
 				vhost_disable_notify(&net->dev, vq);
 				continue;
+			} else {
+				spin_count = 0;
 			}
 			break;
 		}
@@ -506,6 +519,7 @@  static int vhost_net_open(struct inode *inode, struct file *f)
 		return -ENOMEM;
 
 	dev = &n->dev;
+	n->spin_threshold = spin_threshold;
 	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
 	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
 	r = vhost_dev_init(dev, n->vqs, workers, VHOST_NET_VQ_MAX);